[llvm] e1094dd - [AMDGPU][DAG] Enable ganging up of memcpy loads/stores for AMDGPU (#96185)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 2 23:32:41 PDT 2024
Author: Fabian Ritter
Date: 2024-07-03T08:32:35+02:00
New Revision: e1094dd889c516da0c3181bf2be44ad631a84255
URL: https://github.com/llvm/llvm-project/commit/e1094dd889c516da0c3181bf2be44ad631a84255
DIFF: https://github.com/llvm/llvm-project/commit/e1094dd889c516da0c3181bf2be44ad631a84255.diff
LOG: [AMDGPU][DAG] Enable ganging up of memcpy loads/stores for AMDGPU (#96185)
In the SelectionDAG lowering of the memcpy intrinsic, this optimization
introduces additional chains between fixed-size groups of loads and the
corresponding stores. While initially introduced to ensure that wider
load/store-pair instructions are generated on AArch64, this optimization
also improves code generation for AMDGPU: Ganged loads are scheduled
into a clause; stores only await completion of their corresponding load.
The chosen value of 16 performed well in microbenchmarks; values of 8,
32, or 64 would perform similarly.
The testcase updates are autogenerated by
utils/update_llc_test_checks.py.
See also:
- PR introducing this optimization: https://reviews.llvm.org/D46477
Part of SWDEV-455845.
Added:
llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll
llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 747cedb111d31..ef30bf6d993fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -67,6 +67,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
+ // Enable ganging up loads and stores in the memcpy DAG lowering.
+ MaxGluedStoresPerMemcpy = 16;
+
// Lower floating point store/load to integer store/load to reduce the number
// of patterns in tablegen.
setOperationAction(ISD::LOAD, MVT::f32, Promote);
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index a118fa388f86d..645e48f1bb1ab 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -9074,8 +9074,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
@@ -9113,9 +9113,9 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX9-NEXT: s_mov_b32 s4, byval_align16_f64_arg@abs32@lo
; GFX9-NEXT: v_writelane_b32 v40, s63, 31
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_readlane_b32 s63, v40, 31
; GFX9-NEXT: v_readlane_b32 s62, v40, 30
@@ -9167,17 +9167,17 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20
-; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20
; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: s_mov_b32 s5, byval_align16_f64_arg@abs32@hi
; GFX10-NEXT: s_mov_b32 s4, byval_align16_f64_arg@abs32@lo
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4
+; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32
+; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: v_writelane_b32 v40, s35, 3
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
index a5e0ceaa6b329..343925528a520 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
@@ -8,22 +8,22 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
-; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16
-; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off
+; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
; MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; MUBUF-NEXT: s_waitcnt vmcnt(2)
-; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36
; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32
+; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36
; MUBUF-NEXT: s_waitcnt vmcnt(3)
-; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28
-; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24
-; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20
-; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16
+; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:12
+; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:8
+; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4
+; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32
; MUBUF-NEXT: s_waitcnt vmcnt(6)
-; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:12
-; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:8
-; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:4
-; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32
+; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28
+; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24
+; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20
+; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:16
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; use v0
; MUBUF-NEXT: ;;#ASMEND
@@ -33,16 +33,16 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
; FLATSCR-LABEL: memcpy_fixed_align:
; FLATSCR: ; %bb.0:
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
-; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16
-; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off
; FLATSCR-NEXT: v_mov_b32_e32 v0, s32
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
-; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32 offset:16
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32 offset:16
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
-; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; use v0
; FLATSCR-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 358f42dfe8dd5..ae1f31272a15f 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -13,148 +13,105 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: v_mov_b32_e32 v1, s3
; CHECK-NEXT: flat_load_ubyte v4, v[0:1]
+; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1
+; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2
+; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3
+; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4
+; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5
+; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6
+; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7
+; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8
+; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:9
+; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:10
+; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:11
+; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:12
+; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:13
+; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:14
; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:1
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:1
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:2
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:2
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:3
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:3
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:4
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:4
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:5
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:5
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:6
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:6
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:7
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:7
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:8
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:9
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:9
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:10
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:10
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:11
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:11
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:12
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:13
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:13
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:14
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:14
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:15
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:15
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:16
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:17
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:17
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:18
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:18
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:19
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:19
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:20
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:21
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:21
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:22
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:22
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:23
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:23
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:24
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:25
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:25
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:26
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:26
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:27
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:27
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:28
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:29
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:29
+; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:1
+; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:2
+; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:3
+; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:4
+; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:5
+; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:6
+; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:7
+; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:8
+; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:9
+; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:10
+; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:11
+; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12
+; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13
+; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30
+; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:29
+; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:28
+; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:27
+; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:26
+; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:25
+; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:24
+; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:23
+; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:22
+; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:21
+; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:20
+; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:19
+; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:18
+; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:17
+; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:16
+; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:31
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:31
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:32
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:33
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:33
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:34
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:34
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:35
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:35
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:36
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:36
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:37
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:37
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:38
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:38
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:39
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:39
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:40
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:40
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:41
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:41
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:42
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:42
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:43
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:43
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:44
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:44
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:45
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:45
-; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:46
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:46
+; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:29
+; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:28
+; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:27
+; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:26
+; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:25
+; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:24
+; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:23
+; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:22
+; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:21
+; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:20
+; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:19
+; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:18
+; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:17
+; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:16
+; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:15
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46
+; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45
+; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44
+; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:43
+; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:42
+; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:41
+; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:40
+; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:39
+; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:38
+; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:37
+; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:36
+; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:35
+; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34
+; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33
+; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46
+; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45
+; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:44
+; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:43
+; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:42
+; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:41
+; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:40
+; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:39
+; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:38
+; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:37
+; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:36
+; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:35
+; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34
+; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33
+; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:32
+; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:31
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -165,20 +122,20 @@ define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addr
; CHECK-LABEL: memcpy_p1_p1_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: v_mov_b32_e32 v12, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:39
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:39
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v12, s[2:3] offset:32
+; CHECK-NEXT: global_load_dwordx2 v[10:11], v12, s[2:3] offset:39
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:39
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
@@ -189,32 +146,32 @@ define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr ad
; CHECK-LABEL: memcpy_p1_p4_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:48
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:80
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:96
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; CHECK-NEXT: v_mov_b32_e32 v32, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v32, s[2:3] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v32, s[2:3] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
+; CHECK-NEXT: global_load_dwordx4 v[24:27], v32, s[2:3] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
@@ -231,394 +188,372 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_add_u32 s8, s8, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1]
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:3
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:4
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:7
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:8
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:9
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:11
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:12
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:13
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:14
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:15
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:13
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:12
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:11
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:10
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:9
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:7
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:6
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:5
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:4
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:3
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:2
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:1
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1]
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: v_mov_b32_e32 v1, s2
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:16
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:17
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:18
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:13
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:19
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:10
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:1
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:20
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:9
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:2
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:21
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:8
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:3
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:22
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:7
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:4
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:23
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:6
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:5
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:2
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:1
; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:5
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:24
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:4
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:3
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:26
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:25
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:24
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:45
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:23
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:22
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:21
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:20
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:19
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:29
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:18
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:16
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61
; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:27
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:7
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:26
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:26
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:8
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:27
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:25
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:9
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:28
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:24
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:44
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57
; CHECK-NEXT: s_waitcnt vmcnt(28)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:10
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:29
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:11
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:30
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:43
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:45
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58
; CHECK-NEXT: s_waitcnt vmcnt(30)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:12
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:31
-; CHECK-NEXT: s_waitcnt vmcnt(31)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:13
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(32)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:14
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:33
-; CHECK-NEXT: s_waitcnt vmcnt(33)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:15
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:34
-; CHECK-NEXT: s_waitcnt vmcnt(34)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:16
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:35
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:17
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:36
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:18
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:37
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:19
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:20
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:39
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:21
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:40
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:22
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:41
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:23
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:42
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:24
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:43
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:44
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:26
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:45
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:27
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:46
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:28
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:47
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:29
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:48
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:30
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:49
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:31
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:50
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:32
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:51
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:33
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:52
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:34
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:53
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:35
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:54
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:36
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:55
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:37
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:56
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:39
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:58
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:40
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:59
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:41
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:60
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:42
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:61
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:43
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:62
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:44
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:63
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:45
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:64
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:46
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:65
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:47
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:66
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:48
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:67
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:49
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:68
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:50
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:69
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:51
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:70
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:52
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:71
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:53
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:72
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:54
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:73
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:55
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:74
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:56
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:75
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:76
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:58
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:77
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:59
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:78
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:60
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:79
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:61
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:80
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:62
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:81
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:63
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:82
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:64
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:83
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:65
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:84
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:66
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:85
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:67
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:86
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:68
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:87
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:69
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:88
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:70
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:71
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:90
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:72
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:91
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:73
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:92
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:74
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:93
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:75
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:94
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:76
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:95
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:77
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:96
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:78
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:97
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:79
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:98
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:80
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:99
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:81
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:100
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:82
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:101
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:83
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:102
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:84
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:103
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:85
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:104
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:86
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:105
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:87
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:106
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:88
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:107
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:90
-; CHECK-NEXT: s_waitcnt vmcnt(34)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:91
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:36
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:35
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:47
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60
; CHECK-NEXT: s_waitcnt vmcnt(33)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:92
-; CHECK-NEXT: s_waitcnt vmcnt(32)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:93
-; CHECK-NEXT: s_waitcnt vmcnt(31)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:94
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:34
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:28
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:42
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:33
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:32
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:61
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:40
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:39
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52
; CHECK-NEXT: s_waitcnt vmcnt(30)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:95
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:38
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:37
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:57
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:56
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:58
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:49
; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:96
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:48
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:46
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:60
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73
; CHECK-NEXT: s_waitcnt vmcnt(28)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:97
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:98
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:99
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:41
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:55
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:100
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:108
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:109
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:111
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:112
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:113
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:114
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:115
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:116
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:117
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:118
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:119
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:101
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:120
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:102
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:103
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:122
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:104
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:123
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:105
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:124
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:106
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:125
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:107
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:126
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:127
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:108
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:109
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:111
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:113
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:114
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:115
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:116
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:117
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:74
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:53
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:52
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:51
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:63
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:50
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:77
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:119
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:71
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:70
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:69
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:120
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:59
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:73
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:54
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:68
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:66
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111
; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:122
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:65
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:64
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:62
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:76
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:123
-; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:124
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:90
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:72
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:87
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:67
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:79
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:95
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:93
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:125
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:75
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:89
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:78
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:94
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:92
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:88
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:91
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:86
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:85
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:84
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:83
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:82
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:98
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:81
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:80
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:111
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:109
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:108
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:100
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:124
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:107
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:106
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:105
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:103
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:102
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:101
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114
+; CHECK-NEXT: s_waitcnt vmcnt(34)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:104
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113
+; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:99
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:98
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:97
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:96
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:127
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:126
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:125
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:124
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:123
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:122
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:121
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:120
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:119
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:117
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:116
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:115
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:114
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:126
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:113
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:127
+; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:112
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -635,367 +570,362 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:2
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: s_waitcnt vmcnt(17)
-; CHECK-NEXT: flat_store_byte v[0:1], v3
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:23
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:22
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:21
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:20
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:19
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:18
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:17
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:47
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v18
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:45
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:37
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:36
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:35
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:34
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:33
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:32
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:29
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:44
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:63
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:42
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:40
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:39
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:38
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:41
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:18
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:59
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:19
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:21
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:51
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:50
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:23
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:49
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:48
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:46
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:26
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:61
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:27
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:43
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:28
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:58
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:29
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:79
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:30
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:31
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:56
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:32
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:54
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:33
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:53
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:34
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:52
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:35
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:55
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:36
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:73
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:37
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:38
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:65
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:40
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:64
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:41
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:62
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:42
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:77
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:43
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:60
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:44
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:75
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:45
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:46
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:57
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:47
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:72
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:48
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:95
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:49
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:70
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:50
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:68
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:67
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:52
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:66
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:53
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:69
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:54
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:87
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:57
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:111
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:58
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:110
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:59
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:76
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:60
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:91
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:61
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:74
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:62
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:89
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:63
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:71
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:64
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:86
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:65
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:84
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:66
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:83
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:67
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:81
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:68
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:80
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:69
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:78
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:70
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:93
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:71
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:82
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:101
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:90
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:105
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:88
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:103
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:80
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:85
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:81
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:100
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:82
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:92
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:83
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:107
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:104
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:84
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:102
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:85
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:99
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:86
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:94
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:87
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:109
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:88
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:106
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:108
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:96
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:97
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:98
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:120
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:121
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:122
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:123
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:124
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:104
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:103
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:126
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:116
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:117
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:119
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:127
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:114
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:115
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:125
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:89
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:113
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:112
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:127
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:126
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:91
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:92
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:93
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:94
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:95
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:96
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:97
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:98
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:99
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:100
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:101
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:102
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:103
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:104
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:105
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:106
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:107
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:108
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:109
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:110
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:111
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:112
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:113
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:114
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:115
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:116
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:117
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:118
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:119
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:120
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:121
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:122
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:123
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:124
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:125
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:126
-; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:127
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:125
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:124
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:122
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:121
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:120
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:119
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:117
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:112
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
@@ -1008,30 +938,30 @@ define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 {
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
-; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
-; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:80
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15
+; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset1:1
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
+; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:2 offset1:3
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11
+; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:4 offset1:5
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9
+; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:6 offset1:7
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7
+; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:8 offset1:9
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5
+; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:10 offset1:11
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3
+; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:12 offset1:13
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset1:1
+; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:14 offset1:15
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
@@ -1043,298 +973,278 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:127
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:126
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:125
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:124
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:112
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:113
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:114
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:115
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:116
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:117
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:118
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:119
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:127
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:126
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:123
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:125
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:124
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:122
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:121
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:112
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:117
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:119
; CHECK-NEXT: ds_read_u8 v3, v2 offset:120
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:119
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:122
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:121
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:118
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:121
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:122
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:123
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:124
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:125
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:126
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:127
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:119
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:117
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:116
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:118
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:115
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:114
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:117
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:116
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:113
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:115
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:112
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:111
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:110
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:109
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:111
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:108
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:110
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:109
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:107
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:106
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:108
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:105
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:104
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:107
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:103
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:104
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:102
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:101
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:103
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:121
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:122
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:124
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:125
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:126
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:127
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:96
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:97
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:98
; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:102
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:101
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:98
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:100
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:100
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:101
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:102
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:103
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:97
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:96
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:98
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:95
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:94
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:96
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:93
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:95
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:94
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:92
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:91
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:93
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:90
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:89
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:92
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:91
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:88
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:89
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:87
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:86
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:88
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:85
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:84
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:86
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:83
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:85
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:82
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:81
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:104
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:105
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:106
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:107
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:108
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:109
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:110
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:111
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:104
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:107
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:108
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:109
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:111
; CHECK-NEXT: ds_read_u8 v3, v2 offset:80
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:79
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:82
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:81
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:78
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:81
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:82
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:83
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:84
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:85
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:86
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:87
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:79
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:77
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:76
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:78
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:75
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:74
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:77
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:76
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:73
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:75
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:74
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:72
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:71
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:73
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:70
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:69
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:72
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:71
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:68
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:70
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:69
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:67
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:81
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:85
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:87
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:88
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:89
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:90
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:91
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:92
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:93
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:94
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:95
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:88
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:89
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:90
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:91
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:92
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:93
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:94
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:95
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:64
; CHECK-NEXT: ds_read_u8 v4, v2 offset:65
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:64
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:67
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:68
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:69
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:70
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:71
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:67
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:63
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:64
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:62
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:61
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:72
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:73
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:74
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:75
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:76
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:77
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:78
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:79
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:63
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:60
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:48
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:49
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:50
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:51
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:52
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:53
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:54
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:55
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:48
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:49
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:50
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:51
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:52
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:53
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:54
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:55
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:56
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:57
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:58
; CHECK-NEXT: ds_read_u8 v6, v2 offset:59
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:62
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:61
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:58
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:60
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:60
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:61
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:62
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:63
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:56
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:57
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:58
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:57
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:56
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:55
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:54
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:53
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:52
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:51
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:53
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:50
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:49
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:52
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:51
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:48
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:50
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:49
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:47
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:46
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:48
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:45
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:44
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:47
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:46
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:43
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:45
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:44
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:42
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:41
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:60
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:61
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:62
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:63
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:32
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:33
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:34
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:35
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:36
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:37
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:38
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:39
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39
; CHECK-NEXT: ds_read_u8 v3, v2 offset:40
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:39
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:38
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:41
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:42
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:43
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:44
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:45
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:46
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:47
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:37
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:36
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:38
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:35
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:34
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:37
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:36
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:33
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:35
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:34
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:32
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:31
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:33
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:30
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:29
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:32
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:31
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:27
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:26
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:25
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:27
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:23
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:24
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:22
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:21
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:23
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:20
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:19
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:21
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:41
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:44
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:47
+; CHECK-NEXT: ds_read_u8 v3, v2
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:1
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:2
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:3
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:4
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:5
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:6
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:7
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:8
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:9
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:10
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:11
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:12
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:13
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:14
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:15
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:16
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:17
+; CHECK-NEXT: ds_read_u8 v21, v2 offset:18
+; CHECK-NEXT: ds_read_u8 v22, v2 offset:19
+; CHECK-NEXT: ds_read_u8 v23, v2 offset:20
+; CHECK-NEXT: ds_read_u8 v24, v2 offset:21
+; CHECK-NEXT: ds_read_u8 v25, v2 offset:22
+; CHECK-NEXT: ds_read_u8 v26, v2 offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:19
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:16
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:17
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:18
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:8
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:9
-; CHECK-NEXT: ds_read_u8 v7, v2 offset:10
-; CHECK-NEXT: ds_read_u8 v8, v2 offset:11
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:12
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:13
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:14
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:15
-; CHECK-NEXT: ds_read_u8 v13, v2
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:1
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:2
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:3
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:4
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:5
-; CHECK-NEXT: ds_read_u8 v19, v2 offset:6
-; CHECK-NEXT: ds_read_u8 v2, v2 offset:7
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:17
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:15
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:14
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:13
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:12
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:9
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8
-; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:6
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:5
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:4
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:1
-; CHECK-NEXT: flat_store_byte v[0:1], v13
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:19
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:23
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:24
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:25
+; CHECK-NEXT: ds_read_u8 v21, v2 offset:26
+; CHECK-NEXT: ds_read_u8 v22, v2 offset:27
+; CHECK-NEXT: ds_read_u8 v23, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v24, v2 offset:29
+; CHECK-NEXT: ds_read_u8 v25, v2 offset:30
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:31
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:27
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:31
+; CHECK-NEXT: flat_store_byte v[0:1], v3
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
@@ -1349,148 +1259,105 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: v_mov_b32_e32 v1, s3
; CHECK-NEXT: flat_load_ubyte v4, v[0:1]
+; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1
+; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2
+; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3
+; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4
+; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5
+; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6
+; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7
+; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8
+; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:9
+; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:10
+; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:11
+; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:12
+; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:13
+; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:14
; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:1
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:1
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:2
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:2
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:3
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:3
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:4
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:4
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:5
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:5
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:6
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:6
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:7
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:7
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:8
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:9
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:9
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:10
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:10
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:11
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:11
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:12
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:13
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:13
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:14
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:14
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:15
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:15
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:16
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:17
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:17
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:18
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:18
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:19
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:19
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:20
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:21
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:21
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:22
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:22
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:23
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:23
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:24
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:25
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:25
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:26
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:26
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:27
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:27
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:28
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:29
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:29
+; CHECK-NEXT: flat_store_byte v[2:3], v4
+; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:1
+; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:2
+; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:3
+; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:4
+; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:5
+; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:6
+; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:7
+; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:8
+; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:9
+; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:10
+; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:11
+; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12
+; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13
+; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30
+; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:29
+; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:28
+; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:27
+; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:26
+; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:25
+; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:24
+; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:23
+; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:22
+; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:21
+; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:20
+; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:19
+; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:18
+; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:17
+; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:16
+; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:31
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:31
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:32
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:33
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:33
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:34
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:34
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:35
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:35
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:36
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:36
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:37
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:37
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:38
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:38
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:39
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:39
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:40
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:40
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:41
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:41
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:42
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:42
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:43
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:43
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:44
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:44
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:45
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:45
-; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:46
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:46
+; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:29
+; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:28
+; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:27
+; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:26
+; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:25
+; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:24
+; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:23
+; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:22
+; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:21
+; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:20
+; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:19
+; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:18
+; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:17
+; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:16
+; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:15
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46
+; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45
+; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44
+; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:43
+; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:42
+; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:41
+; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:40
+; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:39
+; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:38
+; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:37
+; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:36
+; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:35
+; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34
+; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33
+; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46
+; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45
+; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:44
+; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:43
+; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:42
+; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:41
+; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:40
+; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:39
+; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:38
+; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:37
+; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:36
+; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:35
+; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34
+; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33
+; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:32
+; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:31
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -1501,20 +1368,20 @@ define amdgpu_kernel void @memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addr
; CHECK-LABEL: memcpy_p1_p1_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: v_mov_b32_e32 v12, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:39
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:39
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v12, s[2:3] offset:32
+; CHECK-NEXT: global_load_dwordx2 v[10:11], v12, s[2:3] offset:39
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:39
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
@@ -1525,32 +1392,32 @@ define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr ad
; CHECK-LABEL: memcpy_p1_p4_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:48
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:80
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:96
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; CHECK-NEXT: v_mov_b32_e32 v32, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v32, s[2:3] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v32, s[2:3] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
+; CHECK-NEXT: global_load_dwordx4 v[24:27], v32, s[2:3] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
@@ -1567,394 +1434,372 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_add_u32 s8, s8, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1]
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:3
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:4
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:7
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:8
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:9
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:11
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:12
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:13
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:14
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:15
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:13
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:12
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:11
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:10
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:9
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:7
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:6
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:5
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:4
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:3
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:2
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:1
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1]
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: v_mov_b32_e32 v1, s2
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:16
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:17
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:18
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:11
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:19
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:10
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:1
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:20
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:9
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:2
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:21
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:8
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:3
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:22
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:7
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:4
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:23
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:6
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:5
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:2
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:4
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:3
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:26
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:25
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:24
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:45
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:23
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:22
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:21
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:20
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:19
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:29
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:18
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:17
; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:5
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:24
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:16
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61
; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:27
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:7
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:26
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:26
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:8
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:27
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:25
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:24
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:9
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:28
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:44
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57
; CHECK-NEXT: s_waitcnt vmcnt(28)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:10
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:29
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:11
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:30
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:43
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:45
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58
; CHECK-NEXT: s_waitcnt vmcnt(30)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:12
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:31
-; CHECK-NEXT: s_waitcnt vmcnt(31)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:13
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(32)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:14
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:33
-; CHECK-NEXT: s_waitcnt vmcnt(33)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:15
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:34
-; CHECK-NEXT: s_waitcnt vmcnt(34)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:16
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:35
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:17
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:36
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:18
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:37
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:19
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:20
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:39
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:21
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:40
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:22
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:41
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:23
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:42
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:24
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:43
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:44
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:26
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:45
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:27
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:46
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:28
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:47
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:29
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:48
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:30
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:49
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:31
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:50
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:32
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:51
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:33
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:52
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:34
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:53
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:35
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:54
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:36
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:55
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:37
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:56
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:39
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:58
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:40
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:59
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:41
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:60
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:42
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:61
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:43
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:62
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:44
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:63
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:45
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:64
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:46
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:65
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:47
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:66
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:48
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:67
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:49
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:68
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:50
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:69
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:51
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:70
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:52
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:71
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:53
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:72
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:54
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:73
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:55
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:74
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:56
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:75
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:76
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:58
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:77
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:59
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:78
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:60
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:79
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:61
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:80
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:62
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:81
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:63
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:82
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:64
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:83
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:65
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:84
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:66
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:85
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:67
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:86
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:68
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:87
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:69
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:88
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:70
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:71
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:90
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:72
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:91
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:73
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:92
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:74
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:93
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:75
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:94
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:76
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:95
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:77
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:96
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:78
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:97
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:79
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:98
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:80
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:99
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:81
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:100
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:82
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:101
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:83
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:102
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:84
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:103
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:85
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:104
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:86
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:105
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:87
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:106
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:88
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:107
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:90
-; CHECK-NEXT: s_waitcnt vmcnt(34)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:91
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:36
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:35
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:47
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60
; CHECK-NEXT: s_waitcnt vmcnt(33)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:92
-; CHECK-NEXT: s_waitcnt vmcnt(32)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:93
-; CHECK-NEXT: s_waitcnt vmcnt(31)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:94
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:34
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:28
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:42
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:33
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:32
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:61
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:40
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:39
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52
; CHECK-NEXT: s_waitcnt vmcnt(30)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:95
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:38
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:37
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:57
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:56
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:58
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:49
; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:96
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:48
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:46
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:60
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73
; CHECK-NEXT: s_waitcnt vmcnt(28)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:97
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:98
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:99
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:41
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:55
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:100
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:108
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:109
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:111
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:112
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:113
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:114
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:115
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:116
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:117
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:118
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:119
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:101
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:120
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:102
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:103
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:122
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:104
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:123
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:105
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:124
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:106
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:125
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:107
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:126
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:127
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:108
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:109
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:111
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:113
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:114
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:115
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:116
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:117
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:74
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:53
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:52
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:51
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:63
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:50
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:77
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:119
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:71
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:70
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:69
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:120
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:59
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:73
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:54
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:68
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:66
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111
; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:122
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:65
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:64
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:62
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:76
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:123
-; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:124
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:90
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:72
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:87
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:67
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:79
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:95
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:93
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:75
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:125
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:89
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:78
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:94
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:92
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:88
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:91
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:86
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:85
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:84
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:83
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:82
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:98
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:81
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:80
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:111
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:109
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:108
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:100
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:124
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:107
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:106
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:105
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:103
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:102
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:101
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114
+; CHECK-NEXT: s_waitcnt vmcnt(34)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:104
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113
+; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:99
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:98
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:97
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:96
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:127
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:126
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:125
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:124
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:123
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:122
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:121
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:120
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:119
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:117
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:116
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:115
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:114
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:126
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:113
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:127
+; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:112
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -1971,367 +1816,362 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:2
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: s_waitcnt vmcnt(17)
-; CHECK-NEXT: flat_store_byte v[0:1], v3
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:23
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:22
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:21
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:20
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:19
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:18
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:17
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:47
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v18
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:45
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:37
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:36
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:35
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:34
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:33
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:32
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:29
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:44
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:63
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:42
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:40
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:39
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:38
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:41
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:18
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:59
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:19
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:51
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:50
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:21
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:49
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:48
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:23
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:46
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:61
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:43
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:26
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:58
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:27
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:79
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:28
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:29
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:30
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:31
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:56
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:32
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:54
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:33
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:53
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:34
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:52
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:35
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:55
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:36
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:73
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:37
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:38
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:40
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:65
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:41
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:64
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:42
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:62
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:43
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:77
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:44
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:60
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:45
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:75
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:46
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:57
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:47
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:72
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:48
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:95
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:49
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:70
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:50
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:68
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:67
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:52
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:66
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:53
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:69
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:54
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:87
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:57
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:111
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:58
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:110
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:59
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:76
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:60
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:91
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:61
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:74
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:62
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:89
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:63
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:71
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:64
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:86
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:65
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:84
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:66
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:83
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:67
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:81
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:68
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:80
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:69
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:78
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:70
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:93
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:71
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:82
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:101
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:90
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:105
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:88
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:103
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:80
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:85
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:81
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:100
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:82
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:92
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:83
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:107
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:104
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:84
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:102
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:85
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:99
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:86
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:94
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:87
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:109
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:88
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:106
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:108
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:96
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:97
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:98
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:120
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:121
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:122
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:123
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:124
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:104
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:103
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:126
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:116
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:117
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:119
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:127
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:114
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:115
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:125
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:89
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:113
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:112
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:91
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:92
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:93
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:94
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:95
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:96
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:97
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:98
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:99
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:100
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:101
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:102
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:103
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:104
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:105
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:106
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:107
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:108
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:127
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:126
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:109
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:110
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:111
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:112
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:113
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:114
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:115
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:116
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:117
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:118
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:119
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:120
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:121
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:122
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:123
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:124
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:125
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:126
-; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:127
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:125
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:124
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:122
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:121
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:120
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:119
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:117
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:112
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
@@ -2344,30 +2184,30 @@ define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 {
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
-; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
-; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:80
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15
+; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset1:1
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
+; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:2 offset1:3
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11
+; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:4 offset1:5
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9
+; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:6 offset1:7
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7
+; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:8 offset1:9
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5
+; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:10 offset1:11
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3
+; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:12 offset1:13
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset1:1
+; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:14 offset1:15
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
@@ -2379,298 +2219,278 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:127
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:126
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:125
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:124
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:112
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:113
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:114
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:115
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:116
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:117
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:118
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:119
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:127
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:126
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:123
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:125
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:124
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:122
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:121
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:112
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:117
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:119
; CHECK-NEXT: ds_read_u8 v3, v2 offset:120
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:119
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:122
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:121
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:118
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:121
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:122
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:123
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:124
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:125
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:126
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:127
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:119
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:117
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:116
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:118
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:115
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:114
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:117
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:116
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:113
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:115
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:112
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:111
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:110
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:109
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:111
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:108
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:110
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:109
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:107
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:106
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:108
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:105
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:104
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:107
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:103
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:104
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:102
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:101
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:103
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:121
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:122
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:124
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:125
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:126
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:127
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:96
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:97
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:98
; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:102
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:101
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:98
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:100
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:100
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:101
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:102
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:103
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:97
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:96
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:98
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:95
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:94
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:96
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:93
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:95
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:94
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:92
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:91
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:93
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:90
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:89
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:92
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:91
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:88
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:89
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:87
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:86
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:88
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:85
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:84
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:86
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:83
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:85
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:82
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:81
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:104
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:105
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:106
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:107
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:108
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:109
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:110
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:111
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:104
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:107
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:108
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:109
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:111
; CHECK-NEXT: ds_read_u8 v3, v2 offset:80
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:79
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:82
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:81
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:78
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:81
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:82
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:83
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:84
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:85
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:86
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:87
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:79
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:77
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:76
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:78
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:75
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:74
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:77
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:76
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:73
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:75
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:74
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:72
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:71
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:73
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:70
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:69
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:72
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:71
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:68
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:70
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:69
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:67
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:81
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:85
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:87
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:88
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:89
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:90
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:91
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:92
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:93
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:94
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:95
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:88
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:89
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:90
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:91
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:92
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:93
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:94
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:95
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:64
; CHECK-NEXT: ds_read_u8 v4, v2 offset:65
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:64
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:67
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:68
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:69
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:70
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:71
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:67
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:63
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:64
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:62
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:61
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:72
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:73
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:74
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:75
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:76
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:77
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:78
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:79
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:63
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:60
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:48
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:49
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:50
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:51
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:52
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:53
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:54
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:55
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:48
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:49
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:50
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:51
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:52
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:53
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:54
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:55
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:56
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:57
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:58
; CHECK-NEXT: ds_read_u8 v6, v2 offset:59
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:62
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:61
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:58
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:60
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:60
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:61
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:62
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:63
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:56
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:57
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:58
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:57
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:56
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:55
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:54
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:53
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:52
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:51
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:53
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:50
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:49
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:52
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:51
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:48
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:50
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:49
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:47
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:46
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:48
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:45
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:44
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:47
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:46
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:43
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:45
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:44
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:42
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:41
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:60
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:61
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:62
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:63
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:32
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:33
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:34
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:35
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:36
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:37
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:38
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:39
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39
; CHECK-NEXT: ds_read_u8 v3, v2 offset:40
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:39
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:38
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:41
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:42
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:43
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:44
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:45
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:46
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:47
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:37
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:36
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:38
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:35
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:34
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:37
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:36
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:33
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:35
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:34
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:32
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:31
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:41
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:44
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:47
+; CHECK-NEXT: ds_read_u8 v3, v2
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:1
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:2
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:3
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:4
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:5
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:6
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:7
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:8
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:9
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:10
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:11
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:12
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:13
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:14
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:15
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:16
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:17
+; CHECK-NEXT: ds_read_u8 v21, v2 offset:18
+; CHECK-NEXT: ds_read_u8 v22, v2 offset:19
+; CHECK-NEXT: ds_read_u8 v23, v2 offset:20
+; CHECK-NEXT: ds_read_u8 v24, v2 offset:21
+; CHECK-NEXT: ds_read_u8 v25, v2 offset:22
+; CHECK-NEXT: ds_read_u8 v26, v2 offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:33
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:30
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:29
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:32
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:31
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:27
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:26
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:25
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:27
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:23
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:24
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:22
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:21
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:23
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:20
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:19
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:21
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:18
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:19
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:16
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:17
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:18
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:8
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:9
-; CHECK-NEXT: ds_read_u8 v7, v2 offset:10
-; CHECK-NEXT: ds_read_u8 v8, v2 offset:11
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:12
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:13
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:14
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:15
-; CHECK-NEXT: ds_read_u8 v13, v2
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:1
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:2
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:3
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:4
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:5
-; CHECK-NEXT: ds_read_u8 v19, v2 offset:6
-; CHECK-NEXT: ds_read_u8 v2, v2 offset:7
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:17
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:15
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:14
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:13
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:12
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:9
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8
-; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:6
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:5
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:4
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:1
-; CHECK-NEXT: flat_store_byte v[0:1], v13
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:19
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:23
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:24
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:25
+; CHECK-NEXT: ds_read_u8 v21, v2 offset:26
+; CHECK-NEXT: ds_read_u8 v22, v2 offset:27
+; CHECK-NEXT: ds_read_u8 v23, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v24, v2 offset:29
+; CHECK-NEXT: ds_read_u8 v25, v2 offset:30
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:31
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:27
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:31
+; CHECK-NEXT: flat_store_byte v[0:1], v3
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
new file mode 100644
index 0000000000000..7575782c1b2ac
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
@@ -0,0 +1,9296 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s
+
+; Testing codegen for memcpy with vector operands for all combinations of the following parameters:
+; destination address space: 0, 1, 3, 5
+; source address space: 0, 1, 3, 4, 5
+; alignment: 1, 2, 8, 16
+; sizes: 16, 31, 32
+
+
+define void @memcpy_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p0_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1
+; CHECK-NEXT: flat_load_ubyte v2, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xe
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:13
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:11
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:9
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:7
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:5
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:3
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:1
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v18
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:27
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:21
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:19
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:17
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p0_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1
+; CHECK-NEXT: flat_load_ubyte v19, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v19
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:31
+; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:29
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:27
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:25
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:23
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:21
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:19
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:17
+; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p0_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ushort v2, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ushort v2, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p0_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ushort v2, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p0_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p0_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p0_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x10
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:17
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:19
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:25
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:29
+; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:27
+; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:28
+; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(16)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p0_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p0_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p0_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p0_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p0_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x10
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:17
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:19
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:25
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:29
+; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:27
+; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:28
+; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(16)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p0_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p0_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p1_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1
+; CHECK-NEXT: global_load_ubyte v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xe
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:13
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:11
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:9
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:7
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:5
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:3
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:1
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v18
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p1_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1
+; CHECK-NEXT: global_load_ubyte v19, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v19
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17
+; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p1_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ushort v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ushort v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p1_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ushort v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p1_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p1_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p1_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:19
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:27
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v10
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v10
+; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v11
+; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v11
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p1_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p1_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p1_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p1_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p1_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:19
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:27
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v10
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v10
+; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v11
+; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v11
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p1_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p1_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:15
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:14
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:13
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:12
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:11
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:10
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:9
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:8
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:7
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:6
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:5
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:4
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:3
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:2
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:1
+; CHECK-NEXT: ds_read_u8 v2, v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:14
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:13
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:12
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:11
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:10
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:9
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:8
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:7
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:6
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:5
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:4
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:3
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:2
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:1
+; CHECK-NEXT: ds_read_u8 v17, v2
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:15
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:16
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:17
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:11
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:9
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:5
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v17
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:30
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:29
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:27
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:26
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:25
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:24
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:23
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:22
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:21
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:20
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:19
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:18
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:15
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:14
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:13
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:12
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:11
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:10
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:9
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:8
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:7
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:6
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:5
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:4
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:3
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:2
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:1
+; CHECK-NEXT: ds_read_u8 v18, v2
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:16
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:17
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v18
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:31
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:30
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:29
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:27
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:26
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:25
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:24
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:23
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:22
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:21
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:20
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:19
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:18
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u16 v3, v2 offset:14
+; CHECK-NEXT: ds_read_u16 v4, v2 offset:12
+; CHECK-NEXT: ds_read_u16 v5, v2 offset:10
+; CHECK-NEXT: ds_read_u16 v6, v2 offset:8
+; CHECK-NEXT: ds_read_u16 v7, v2 offset:6
+; CHECK-NEXT: ds_read_u16 v8, v2 offset:4
+; CHECK-NEXT: ds_read_u16 v9, v2 offset:2
+; CHECK-NEXT: ds_read_u16 v2, v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:30
+; CHECK-NEXT: ds_read_u16 v4, v2 offset:28
+; CHECK-NEXT: ds_read_u16 v5, v2 offset:26
+; CHECK-NEXT: ds_read_u16 v6, v2 offset:24
+; CHECK-NEXT: ds_read_u16 v7, v2 offset:22
+; CHECK-NEXT: ds_read_u16 v8, v2 offset:20
+; CHECK-NEXT: ds_read_u16 v9, v2 offset:18
+; CHECK-NEXT: ds_read_u16 v10, v2 offset:16
+; CHECK-NEXT: ds_read_u16 v11, v2 offset:14
+; CHECK-NEXT: ds_read_u16 v12, v2 offset:12
+; CHECK-NEXT: ds_read_u16 v13, v2 offset:10
+; CHECK-NEXT: ds_read_u16 v14, v2 offset:8
+; CHECK-NEXT: ds_read_u16 v15, v2 offset:6
+; CHECK-NEXT: ds_read_u16 v16, v2 offset:4
+; CHECK-NEXT: ds_read_u16 v17, v2 offset:2
+; CHECK-NEXT: ds_read_u16 v2, v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u16 v3, v2 offset:30
+; CHECK-NEXT: ds_read_u16 v4, v2 offset:28
+; CHECK-NEXT: ds_read_u16 v5, v2 offset:26
+; CHECK-NEXT: ds_read_u16 v6, v2 offset:24
+; CHECK-NEXT: ds_read_u16 v7, v2 offset:22
+; CHECK-NEXT: ds_read_u16 v8, v2 offset:20
+; CHECK-NEXT: ds_read_u16 v9, v2 offset:18
+; CHECK-NEXT: ds_read_u16 v10, v2 offset:16
+; CHECK-NEXT: ds_read_u16 v11, v2 offset:14
+; CHECK-NEXT: ds_read_u16 v12, v2 offset:12
+; CHECK-NEXT: ds_read_u16 v13, v2 offset:10
+; CHECK-NEXT: ds_read_u16 v14, v2 offset:8
+; CHECK-NEXT: ds_read_u16 v15, v2 offset:6
+; CHECK-NEXT: ds_read_u16 v16, v2 offset:4
+; CHECK-NEXT: ds_read_u16 v17, v2 offset:2
+; CHECK-NEXT: ds_read_u16 v2, v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p3_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p3_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p3_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p3_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:19
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:27
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v10
+; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v10
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p3_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p3_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
+; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p3_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p3_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p3_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p3_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:19
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:27
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v10
+; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v10
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p3_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p3_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1
+; CHECK-NEXT: global_load_ubyte v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:2
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:3
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:4
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:5
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:6
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:7
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:9
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:10
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:11
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:12
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1
+; CHECK-NEXT: global_load_ubyte v19, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v19
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17
+; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+; 16-byte memcpy from addrspace(4) to addrspace(0), both pointers align 2.
+; Expected code: eight global_load_ushort / flat_store_short pairs at offsets
+; 0..14. Each store directly follows its own load and an s_waitcnt vmcnt(0);
+; no s_clause is expected for this case.
+define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_ushort v4, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v4
+; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:2
+; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:4
+; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:6
+; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:8
+; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:10
+; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12
+; CHECK-NEXT: global_load_ushort v2, v[2:3], off offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2 offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+; 31-byte memcpy from addrspace(4) to addrspace(0), both pointers align 2.
+; Expected code: an s_clause 0xf followed by 16 loads (one global_load_ubyte at
+; offset 30 plus fifteen global_load_ushort at offsets 28..0), then the 16
+; matching stores, each preceded by an s_waitcnt whose vmcnt counts down from
+; 15 to 0.
+define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ushort v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+; 32-byte memcpy from addrspace(4) to addrspace(0), both pointers align 2.
+; Expected code: an s_clause 0xf followed by 16 global_load_ushort at offsets
+; 30..0, then 16 flat_store_short to the same offsets, each preceded by an
+; s_waitcnt whose vmcnt counts down from 15 to 0.
+define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ushort v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+; 16-byte memcpy from addrspace(4) to addrspace(0), both pointers align 8.
+; Expected code: a single global_load_dwordx4 followed by one
+; flat_store_dwordx4 of the loaded registers.
+define void @memcpy_p0_p4_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p4_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+; 31-byte memcpy from addrspace(4) to addrspace(0), both pointers align 8.
+; Expected code: bytes 0..15 are copied with one global_load_dwordx4 /
+; flat_store_dwordx4 pair. The tail is fetched with a second
+; global_load_dwordx4 at offset 15 (re-reading byte 15) and written back as 16
+; single-byte stores to offsets 15..30, using flat_store_byte,
+; flat_store_byte_d16_hi and v_lshrrev_b32 shifts by 8 and 24.
+define void @memcpy_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p4_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v3 offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:19
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:23
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v2
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v2
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v3
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v3
+; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v4
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v4
+; CHECK-NEXT: v_lshrrev_b32_e32 v9, 24, v5
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v5
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+; 32-byte memcpy from addrspace(4) to addrspace(0), both pointers align 8.
+; Expected code: two global_load_dwordx4 / flat_store_dwordx4 pairs at offsets
+; 0 and 16; each store follows its own load after an s_waitcnt vmcnt(0), and
+; no s_clause is expected.
+define void @memcpy_p0_p4_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p4_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+; 16-byte memcpy from addrspace(4) to addrspace(0), both pointers align 16.
+; Expected code: a single global_load_dwordx4 followed by one
+; flat_store_dwordx4 of the loaded registers.
+define void @memcpy_p0_p4_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p4_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+; 31-byte memcpy from addrspace(4) to addrspace(0), both pointers align 16.
+; Expected code: bytes 0..15 are copied with one global_load_dwordx4 /
+; flat_store_dwordx4 pair. The tail is fetched with a second
+; global_load_dwordx4 at offset 15 (re-reading byte 15) and written back as 16
+; single-byte stores to offsets 15..30, using flat_store_byte,
+; flat_store_byte_d16_hi and v_lshrrev_b32 shifts by 8 and 24.
+define void @memcpy_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p4_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v3 offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:19
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:23
+; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v2
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v2
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v3
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v3
+; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v4
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v4
+; CHECK-NEXT: v_lshrrev_b32_e32 v9, 24, v5
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v5
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+; 32-byte memcpy from addrspace(4) to addrspace(0), both pointers align 16.
+; Expected code: two global_load_dwordx4 / flat_store_dwordx4 pairs at offsets
+; 0 and 16; each store follows its own load after an s_waitcnt vmcnt(0), and
+; no s_clause is expected.
+define void @memcpy_p0_p4_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p4_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+; 16-byte memcpy from addrspace(5) to addrspace(0), both pointers align 1.
+; Expected code: an s_clause 0xf followed by 16 buffer_load_ubyte at offsets
+; 15..0, then 16 flat_store_byte to the same offsets, each preceded by an
+; s_waitcnt whose vmcnt counts down from 15 to 0.
+define void @memcpy_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p5_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+; 31-byte memcpy from addrspace(5) to addrspace(0), both pointers align 1.
+; Expected code, in order:
+;  - s_clause 0x11 with 18 buffer_load_ubyte (offsets 14..0, then 15, 16, 17);
+;  - flat_store_byte for offsets 14..0;
+;  - s_clause 0xc with 13 buffer_load_ubyte (offsets 30..18);
+;  - flat_store_byte for offsets 17 and 16, then 30..18, and finally 15.
+; Together the loads and stores cover every byte offset 0..30 exactly once.
+define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x11
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v17
+; CHECK-NEXT: s_clause 0xc
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+; 32-byte memcpy from addrspace(5) to addrspace(0), both pointers align 1.
+; Expected code, in order:
+;  - s_clause 0x11 with 18 buffer_load_ubyte (offsets 15..0, then 16, 17);
+;  - flat_store_byte for offsets 15..0;
+;  - s_clause 0xd with 14 buffer_load_ubyte (offsets 31..18);
+;  - flat_store_byte for offset 17, then 31..18, and finally 16.
+; Together the loads and stores cover every byte offset 0..31 exactly once.
+define void @memcpy_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p5_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x11
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v18
+; CHECK-NEXT: s_clause 0xd
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+; 16-byte memcpy from addrspace(5) to addrspace(0), both pointers align 2.
+; Expected code: an s_clause 0x7 followed by 8 buffer_load_ushort at offsets
+; 14..0, then 8 flat_store_short to the same offsets, each preceded by an
+; s_waitcnt whose vmcnt counts down from 7 to 0.
+define void @memcpy_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p5_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+; 31-byte memcpy from addrspace(5) to addrspace(0), both pointers align 2.
+; Expected code: an s_clause 0xf followed by 16 loads (one buffer_load_ubyte at
+; offset 30 plus fifteen buffer_load_ushort at offsets 28..0), then the 16
+; matching flat stores, each preceded by an s_waitcnt whose vmcnt counts down
+; from 15 to 0.
+define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p5_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p5_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p5_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p5_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x13
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:19
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:23
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p5_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p5_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p5_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p5_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p5_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x13
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:19
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:23
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p0_p5_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p0_p5_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p0_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:5
+; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:7
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3]
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:1
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:3
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:9
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:11
+; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10
+; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5
+; CHECK-NEXT: v_lshl_or_b32 v5, v8, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v6, v11, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v8, v12, 8, v13
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15
+; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v10, v18, 8, v2
+; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5
+; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7
+; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p0_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:27
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:13
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:23
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:21
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:19
+; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17
+; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:11
+; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:9
+; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:7
+; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:5
+; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:1
+; CHECK-NEXT: flat_load_ubyte v32, v[2:3]
+; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:3
+; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25)
+; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v9
+; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5
+; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24)
+; CHECK-NEXT: v_lshl_or_b32 v6, v10, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22)
+; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12
+; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20)
+; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14
+; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19)
+; CHECK-NEXT: v_lshl_or_b32 v5, v7, 8, v15
+; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v16
+; CHECK-NEXT: v_lshl_or_b32 v7, v4, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v18
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20
+; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v5
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v10, v21, 8, v22
+; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v14, v23, 8, v24
+; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v13
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26
+; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v10
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v17, v27, 8, v28
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v30
+; CHECK-NEXT: v_lshl_or_b32 v4, v14, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v32
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v2, v33, 8, v2
+; CHECK-NEXT: v_lshl_or_b32 v3, v17, 16, v16
+; CHECK-NEXT: v_lshl_or_b32 v2, v2, 16, v18
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p0_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:29
+; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:31
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:27
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:21
+; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:19
+; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:17
+; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11
+; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9
+; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7
+; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5
+; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:1
+; CHECK-NEXT: flat_load_ubyte v33, v[2:3]
+; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:3
+; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25)
+; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10
+; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5
+; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24)
+; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22)
+; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13
+; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20)
+; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15
+; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17
+; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v8, v18, 8, v19
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v10, v22, 8, v23
+; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v15, v24, 8, v25
+; CHECK-NEXT: v_lshl_or_b32 v9, v12, 16, v14
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v13, v26, 8, v27
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v17, v28, 8, v29
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31
+; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v13
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v33
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v19, v34, 8, v2
+; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10
+; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16
+; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p0_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ushort v5, v[2:3]
+; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v5
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v7
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p0_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x13
+; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23
+; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:29
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:27
+; CHECK-NEXT: flat_load_ushort v19, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ushort v20, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ushort v21, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ushort v22, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ushort v23, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ushort v2, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v8, v8, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: v_lshl_or_b32 v7, v4, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v5
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14
+; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v10
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v13, v15, 8, v16
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v4, v19, 16, v20
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v3, v21, 16, v22
+; CHECK-NEXT: v_lshl_or_b32 v9, v13, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v2, v23, 16, v2
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p0_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ushort v19, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v4, v6, 16, v9
+; CHECK-NEXT: v_lshl_or_b32 v9, v7, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v3, v12, 16, v13
+; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v11
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v2, v14, 16, v15
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v7, v16, 16, v17
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v6, v18, 16, v19
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p0_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p0_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p0_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x10
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:23
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:21
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19
+; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17
+; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v10, v9, 8, v10
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14
+; CHECK-NEXT: v_lshl_or_b32 v8, v7, 16, v10
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v14, v15, 8, v16
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18
+; CHECK-NEXT: v_lshl_or_b32 v9, v11, 16, v13
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v21
+; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v12
+; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v15
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p0_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p0_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p0_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p0_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p0_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x10
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:23
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:21
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19
+; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17
+; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v10, v9, 8, v10
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14
+; CHECK-NEXT: v_lshl_or_b32 v8, v7, 16, v10
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v14, v15, 8, v16
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18
+; CHECK-NEXT: v_lshl_or_b32 v9, v11, 16, v13
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v21
+; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v12
+; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v15
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p0_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p0_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p1_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p1_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p1_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p1_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p1_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p1_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p1_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p1_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p1_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p1_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p1_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p1_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p1_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p1_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p1_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p1_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p1_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p1_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p1_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p1_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p1_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p1_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p1_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p1_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p3_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p3_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p3_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p3_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b64 v[7:8], v2
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:8
+; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p3_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p3_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p3_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p3_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p3_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p3_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b64 v[7:8], v2
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:8
+; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p3_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p3_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p3_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p3_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p3_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p3_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p3_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p3_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset1:1
+; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p3_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p3_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p3_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p3_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p3_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p3_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p4_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p4_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p4_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:8
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:23
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p4_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p4_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p4_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p4_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p4_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p4_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:8
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:23
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p4_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p4_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p4_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p4_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p4_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p4_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p4_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p4_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p4_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p4_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p4_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p1_p4_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p4_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(4) align 16 readonly %src) { ; 32-byte copy, addrspace(4) -> addrspace(1), both 16-byte aligned; asserted lowering: two dwordx4 load/store pairs at offsets 0 and 16.
+; CHECK-LABEL: memcpy_p1_p4_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 32, i1 false) ; i64 32 is the byte count; the trailing i1 false makes the copy non-volatile.
+ ret void
+}
+
+define void @memcpy_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align 1 readonly %src) { ; 16-byte copy, addrspace(5) -> addrspace(1), both 1-byte aligned; asserted lowering: 16 buffer_load_ubyte under one s_clause, packed with v_lshl_or_b32 (shift by 8, then by 16) into four dwords, then a single global_store_dwordx4.
+; CHECK-LABEL: memcpy_p1_p5_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v6, v12, 8, v11
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13
+; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v11, v2, 8, v17
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4
+; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6
+; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) ; i64 16 is the byte count; the trailing i1 false makes the copy non-volatile.
+ ret void
+}
+
+define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align 1 readonly %src) { ; 31-byte copy, addrspace(5) -> addrspace(1), both 1-byte aligned; asserted lowering: 31 buffer_load_ubyte under one s_clause, packed with v_lshl_or_b32, then a dwordx4 store at offset 0 and dwordx2 stores at offsets 16 and 23 (the last two overlap at byte 23).
+; CHECK-LABEL: memcpy_p1_p5_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13
+; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v21
+; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v32
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4
+; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6
+; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10
+; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15
+; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16
+; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) ; i64 31 is the byte count; the trailing i1 false makes the copy non-volatile.
+ ret void
+}
+
+define void @memcpy_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align 1 readonly %src) { ; 32-byte copy, addrspace(5) -> addrspace(1), both 1-byte aligned; asserted lowering: 32 buffer_load_ubyte under one s_clause, packed with v_lshl_or_b32 into eight dwords, then dwordx4 stores at offsets 0 and 16.
+; CHECK-LABEL: memcpy_p1_p5_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13
+; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29
+; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v31
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v33
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4
+; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6
+; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10
+; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15
+; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16
+; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) ; i64 32 is the byte count; the trailing i1 false makes the copy non-volatile.
+ ret void
+}
+
+define void @memcpy_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(5) align 2 readonly %src) { ; 16-byte copy, addrspace(5) -> addrspace(1), both 2-byte aligned; asserted lowering: 8 buffer_load_ushort under one s_clause, paired with v_lshl_or_b32 (shift by 16) into four dwords, then a single global_store_dwordx4.
+; CHECK-LABEL: memcpy_p1_p5_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) ; i64 16 is the byte count; the trailing i1 false makes the copy non-volatile.
+ ret void
+}
+
+define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(5) align 2 readonly %src) { ; 31-byte copy, addrspace(5) -> addrspace(1), both 2-byte aligned; asserted lowering: 12 buffer_load_ushort (offsets 0-22) plus 8 buffer_load_ubyte (offsets 23-30) under one s_clause, then a dwordx4 store at offset 0 and dwordx2 stores at offsets 16 and 23.
+; CHECK-LABEL: memcpy_p1_p5_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x13
+; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ushort v19, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v20, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v21, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v22, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v15
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v12, v18, 8, v17
+; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v10
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v7, v20, 16, v19
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v6, v22, 16, v21
+; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) ; i64 31 is the byte count; the trailing i1 false makes the copy non-volatile.
+ ret void
+}
+
+define void @memcpy_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(5) align 2 readonly %src) { ; 32-byte copy, addrspace(5) -> addrspace(1), both 2-byte aligned; asserted lowering: 16 buffer_load_ushort under one s_clause, paired with v_lshl_or_b32 (shift by 16) into eight dwords, then dwordx4 stores at offsets 0 and 16.
+; CHECK-LABEL: memcpy_p1_p5_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v9, v16, 16, v15
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v8, v18, 16, v17
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) ; i64 32 is the byte count; the trailing i1 false makes the copy non-volatile.
+ ret void
+}
+
+define void @memcpy_p1_p5_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(5) align 8 readonly %src) { ; 16-byte copy, addrspace(5) -> addrspace(1), both 8-byte aligned; asserted lowering: 4 buffer_load_dword under one s_clause into consecutive registers, then a single global_store_dwordx4 with no repacking.
+; CHECK-LABEL: memcpy_p1_p5_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 16, i1 false) ; i64 16 is the byte count; the trailing i1 false makes the copy non-volatile.
+ ret void
+}
+
+define void @memcpy_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(5) align 8 readonly %src) { ; 31-byte copy, addrspace(5) -> addrspace(1), both 8-byte aligned; asserted lowering: 16 buffer_load_ubyte (offsets 15-30) plus 4 buffer_load_dword (offsets 0-12) under one s_clause, then dwordx4 stores at offsets 0 and 15 (overlapping at byte 15).
+; CHECK-LABEL: memcpy_p1_p5_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x13
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v2, v10, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v11, v12, 8, v11
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v12, v14, 8, v13
+; CHECK-NEXT: v_lshl_or_b32 v10, v2, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v8, v16, 8, v15
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v13, v20, 8, v19
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21
+; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8
+; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11
+; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false) ; i64 31 is the byte count; the trailing i1 false makes the copy non-volatile.
+ ret void
+}
+
+define void @memcpy_p1_p5_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(5) align 8 readonly %src) { ; 32-byte copy, addrspace(5) -> addrspace(1), both 8-byte aligned; asserted lowering: 8 buffer_load_dword under one s_clause, then dwordx4 stores at offsets 0 and 16, the first issued after s_waitcnt vmcnt(4) and the second after vmcnt(0).
+; CHECK-LABEL: memcpy_p1_p5_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 32, i1 false) ; i64 32 is the byte count; the trailing i1 false makes the copy non-volatile.
+ ret void
+}
+
+define void @memcpy_p1_p5_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(5) align 16 readonly %src) { ; 16-byte copy, addrspace(5) -> addrspace(1), both 16-byte aligned; asserted lowering: 4 buffer_load_dword under one s_clause into consecutive registers, then a single global_store_dwordx4 with no repacking.
+; CHECK-LABEL: memcpy_p1_p5_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 16, i1 false) ; i64 16 is the byte count; the trailing i1 false makes the copy non-volatile.
+ ret void
+}
+
+define void @memcpy_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(5) align 16 readonly %src) { ; 31-byte copy, addrspace(5) -> addrspace(1), both 16-byte aligned; asserted lowering: 16 buffer_load_ubyte (offsets 15-30) plus 4 buffer_load_dword (offsets 0-12) under one s_clause, then dwordx4 stores at offsets 0 and 15 (overlapping at byte 15).
+; CHECK-LABEL: memcpy_p1_p5_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x13
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v2, v10, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v11, v12, 8, v11
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v12, v14, 8, v13
+; CHECK-NEXT: v_lshl_or_b32 v10, v2, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v8, v16, 8, v15
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v13, v20, 8, v19
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21
+; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8
+; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11
+; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false) ; i64 31 is the byte count; the trailing i1 false makes the copy non-volatile.
+ ret void
+}
+
+define void @memcpy_p1_p5_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(5) align 16 readonly %src) { ; 32-byte copy, addrspace(5) -> addrspace(1), both 16-byte aligned; asserted lowering: 8 buffer_load_dword under one s_clause, then dwordx4 stores at offsets 0 and 16, the first issued after s_waitcnt vmcnt(4) and the second after vmcnt(0).
+; CHECK-LABEL: memcpy_p1_p5_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 32, i1 false) ; i64 32 is the byte count; the trailing i1 false makes the copy non-volatile.
+ ret void
+}
+
+define void @memcpy_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align 1 readonly %src) { ; 16-byte copy, addrspace(0) -> addrspace(3), both 1-byte aligned; asserted lowering: 16 flat_load_ubyte under one s_clause, packed with v_lshl_or_b32 (shift by 8, then by 16) into four dwords, then a single ds_write2_b64, followed by s_waitcnt lgkmcnt(0) before returning.
+; CHECK-LABEL: memcpy_p3_p0_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:5
+; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:7
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2]
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:1
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:3
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:9
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:11
+; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9
+; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4
+; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v12
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v6, v13, 8, v14
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v8, v15, 8, v16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v1
+; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4
+; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6
+; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false) ; i64 16 is the byte count; the trailing i1 false makes the copy non-volatile.
+ ret void
+}
+
+define void @memcpy_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p0_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:27
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:13
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:23
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:21
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:19
+; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17
+; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:11
+; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:9
+; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:7
+; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:5
+; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:1
+; CHECK-NEXT: flat_load_ubyte v31, v[1:2]
+; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:3
+; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25)
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v8
+; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4
+; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24)
+; CHECK-NEXT: v_lshl_or_b32 v5, v9, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22)
+; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11
+; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20)
+; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13
+; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19)
+; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v14
+; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v17
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v19
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v8, v20, 8, v21
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v13, v22, 8, v23
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v11, v24, 8, v25
+; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v15, v26, 8, v27
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v14, v28, 8, v29
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v17, v32, 8, v1
+; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4
+; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6
+; CHECK-NEXT: v_lshl_or_b32 v6, v10, 16, v12
+; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v11
+; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14
+; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16
+; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:23
+; CHECK-NEXT: ds_write_b64 v0, v[3:4] offset:16
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p0_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:29
+; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:31
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:27
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:21
+; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:19
+; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:17
+; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11
+; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9
+; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7
+; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5
+; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:1
+; CHECK-NEXT: flat_load_ubyte v32, v[1:2]
+; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:3
+; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25)
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9
+; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4
+; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6
+; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24)
+; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22)
+; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12
+; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20)
+; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v16
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v6, v17, 8, v18
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v9, v19, 8, v20
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v8, v21, 8, v22
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v13, v23, 8, v24
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26
+; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v14, v29, 8, v30
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v16, v31, 8, v32
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v17, v33, 8, v1
+; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4
+; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6
+; CHECK-NEXT: v_lshl_or_b32 v6, v11, 16, v10
+; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12
+; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14
+; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p0_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ushort v4, v[1:2]
+; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v1, v8, 16, v4
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v5
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v6
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p0_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x13
+; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:25
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:23
+; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:29
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:27
+; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ushort v18, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ushort v19, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ushort v20, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ushort v21, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ushort v22, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v9
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v1, v10, 16, v4
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12
+; CHECK-NEXT: v_lshl_or_b32 v11, v5, 8, v6
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v16
+; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v3, v17, 16, v18
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v20
+; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v9
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v5, v21, 16, v22
+; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16
+; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p0_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ushort v18, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v4
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v10
+; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v9
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v3, v13, 16, v14
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v18
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p0_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p0_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p0_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p0_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x10
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:23
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:21
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19
+; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17
+; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13
+; CHECK-NEXT: v_lshl_or_b32 v7, v6, 16, v9
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v13, v14, 8, v15
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v17
+; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v14, v18, 8, v19
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v20
+; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v11
+; CHECK-NEXT: v_lshl_or_b32 v5, v5, 16, v14
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: ds_write_b128 v0, v[5:8] offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p0_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p0_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p0_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p0_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[1:4]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p0_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x10
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:23
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:21
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19
+; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17
+; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13
+; CHECK-NEXT: v_lshl_or_b32 v7, v6, 16, v9
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v13, v14, 8, v15
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v17
+; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v14, v18, 8, v19
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v20
+; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v11
+; CHECK-NEXT: v_lshl_or_b32 v5, v5, 16, v14
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[1:4]
+; CHECK-NEXT: ds_write_b128 v0, v[5:8] offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p0_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p0_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[3:6] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[7:10]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p1_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p1_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p1_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p1_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p1_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p1_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p1_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p1_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p1_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p1_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p1_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p1_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p1_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p1_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p1_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p1_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p1_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p1_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p1_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p1_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[1:4]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p1_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p1_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p1_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p1_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p3_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p3_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p3_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p3_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b64 v[5:6], v1 offset:23
+; CHECK-NEXT: ds_read_b64 v[7:8], v1 offset:16
+; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p3_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p3_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset0:2 offset1:3
+; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p3_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p3_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p3_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p3_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b64 v[5:6], v1 offset:23
+; CHECK-NEXT: ds_read_b64 v[7:8], v1 offset:16
+; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p3_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p3_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset0:2 offset1:3
+; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p3_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p3_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p3_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p3_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT: ds_read_b128 v[6:9], v1 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[6:9] offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p3_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p3_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset0:2 offset1:3
+; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p3_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p3_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[1:4], v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[1:4]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p3_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p3_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[2:5], v1 offset:15
+; CHECK-NEXT: ds_read_b128 v[6:9], v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[2:5] offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[6:9]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p3_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p3_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[2:5], v1 offset:16
+; CHECK-NEXT: ds_read_b128 v[6:9], v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[2:5] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[6:9]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p4_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p4_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p4_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p4_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p4_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p4_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p4_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p4_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p4_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p4_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p4_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p4_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p4_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p4_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p4_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p4_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p4_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p4_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p4_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p4_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[1:4]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p4_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p4_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p4_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p4_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p5_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v5, v11, 8, v10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12
+; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v14
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v10, v1, 8, v16
+; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5
+; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p5_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12
+; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v20
+; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v31
+; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5
+; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9
+; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14
+; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15
+; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p5_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12
+; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28
+; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v32
+; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5
+; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9
+; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14
+; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15
+; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p5_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p5_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x13
+; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ushort v18, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v19, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v20, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v21, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v10, v15, 8, v14
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v11, v17, 8, v16
+; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v9
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v5, v21, 16, v20
+; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v10
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p5_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p5_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p5_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p5_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p5_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x13
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v2
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v1, v9, 8, v4
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v10, v11, 8, v10
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v11, v13, 8, v12
+; CHECK-NEXT: v_lshl_or_b32 v4, v1, 16, v2
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v3, v15, 8, v14
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v16
+; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v18
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20
+; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v3
+; CHECK-NEXT: v_lshl_or_b32 v1, v13, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset1:1
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p5_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p5_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p5_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p5_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p5_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x13
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v1, v9, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v10, v11, 8, v10
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v11, v13, 8, v12
+; CHECK-NEXT: v_lshl_or_b32 v9, v1, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v14
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v18
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20
+; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v7
+; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v10
+; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[2:5]
+; CHECK-NEXT: ds_write_b128 v0, v[6:9] offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p3_p5_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p3_p5_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: ds_write_b128 v0, v[2:5]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[6:9] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p0_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1
+; CHECK-NEXT: flat_load_ubyte v1, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xe
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:13
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:11
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:9
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:7
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:5
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:3
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:1
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:27
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:25
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:23
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:21
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:19
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:17
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p0_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1
+; CHECK-NEXT: flat_load_ubyte v18, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:31
+; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:29
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:27
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:25
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:23
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:21
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:19
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:17
+; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p0_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ushort v1, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p0_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ushort v1, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p0_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ushort v1, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p0_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p0_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p0_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p0_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x10
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:17
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:19
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:25
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:29
+; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:27
+; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:28
+; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p0_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p0_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p0_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p0_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p0_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x10
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:17
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:19
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:25
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:29
+; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:27
+; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:28
+; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p0_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p0_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p1_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15
+; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13
+; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11
+; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9
+; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7
+; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5
+; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3
+; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off
+; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:1
+; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:3
+; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:5
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:7
+; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:9
+; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:11
+; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:13
+; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:29
+; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:27
+; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:25
+; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:23
+; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:21
+; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:19
+; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:17
+; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:16
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p1_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15
+; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13
+; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11
+; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9
+; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7
+; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5
+; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3
+; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1
+; CHECK-NEXT: global_load_ubyte v18, v[1:2], off
+; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:31
+; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:29
+; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:27
+; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:25
+; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:23
+; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:21
+; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:19
+; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:17
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p1_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: global_load_ushort v3, v[1:2], off
+; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ushort v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p1_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ushort v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p1_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p1_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p1_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p1_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10
+; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p1_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p1_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p1_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p1_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p1_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p1_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10
+; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p1_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p1_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p3_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v2, v1 offset:15
+; CHECK-NEXT: ds_read_u8 v3, v1 offset:14
+; CHECK-NEXT: ds_read_u8 v4, v1 offset:13
+; CHECK-NEXT: ds_read_u8 v5, v1 offset:12
+; CHECK-NEXT: ds_read_u8 v6, v1 offset:11
+; CHECK-NEXT: ds_read_u8 v7, v1 offset:10
+; CHECK-NEXT: ds_read_u8 v8, v1 offset:9
+; CHECK-NEXT: ds_read_u8 v9, v1 offset:8
+; CHECK-NEXT: ds_read_u8 v10, v1 offset:7
+; CHECK-NEXT: ds_read_u8 v11, v1 offset:6
+; CHECK-NEXT: ds_read_u8 v12, v1 offset:5
+; CHECK-NEXT: ds_read_u8 v13, v1 offset:4
+; CHECK-NEXT: ds_read_u8 v14, v1 offset:3
+; CHECK-NEXT: ds_read_u8 v15, v1 offset:2
+; CHECK-NEXT: ds_read_u8 v16, v1 offset:1
+; CHECK-NEXT: ds_read_u8 v1, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(14)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(11)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt lgkmcnt(10)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(9)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt lgkmcnt(6)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(5)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v2, v1
+; CHECK-NEXT: ds_read_u8 v3, v1 offset:1
+; CHECK-NEXT: ds_read_u8 v4, v1 offset:2
+; CHECK-NEXT: ds_read_u8 v5, v1 offset:3
+; CHECK-NEXT: ds_read_u8 v6, v1 offset:4
+; CHECK-NEXT: ds_read_u8 v7, v1 offset:5
+; CHECK-NEXT: ds_read_u8 v8, v1 offset:6
+; CHECK-NEXT: ds_read_u8 v9, v1 offset:7
+; CHECK-NEXT: ds_read_u8 v10, v1 offset:8
+; CHECK-NEXT: ds_read_u8 v11, v1 offset:9
+; CHECK-NEXT: ds_read_u8 v12, v1 offset:10
+; CHECK-NEXT: ds_read_u8 v13, v1 offset:11
+; CHECK-NEXT: ds_read_u8 v14, v1 offset:12
+; CHECK-NEXT: ds_read_u8 v15, v1 offset:13
+; CHECK-NEXT: ds_read_u8 v16, v1 offset:14
+; CHECK-NEXT: ds_read_u8 v17, v1 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt lgkmcnt(14)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: ds_read_u8 v2, v1 offset:24
+; CHECK-NEXT: ds_read_u8 v3, v1 offset:25
+; CHECK-NEXT: ds_read_u8 v4, v1 offset:26
+; CHECK-NEXT: ds_read_u8 v18, v1 offset:27
+; CHECK-NEXT: ds_read_u8 v19, v1 offset:28
+; CHECK-NEXT: ds_read_u8 v20, v1 offset:29
+; CHECK-NEXT: ds_read_u8 v21, v1 offset:30
+; CHECK-NEXT: ds_read_u8 v22, v1 offset:16
+; CHECK-NEXT: ds_read_u8 v23, v1 offset:17
+; CHECK-NEXT: ds_read_u8 v24, v1 offset:18
+; CHECK-NEXT: ds_read_u8 v25, v1 offset:19
+; CHECK-NEXT: ds_read_u8 v26, v1 offset:20
+; CHECK-NEXT: ds_read_u8 v27, v1 offset:21
+; CHECK-NEXT: ds_read_u8 v28, v1 offset:22
+; CHECK-NEXT: ds_read_u8 v1, v1 offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(27)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(26)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(25)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt lgkmcnt(24)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(23)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt lgkmcnt(22)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(21)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt lgkmcnt(20)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(19)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt lgkmcnt(18)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(17)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt lgkmcnt(16)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p3_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v2, v1 offset:15
+; CHECK-NEXT: ds_read_u8 v3, v1 offset:14
+; CHECK-NEXT: ds_read_u8 v4, v1 offset:13
+; CHECK-NEXT: ds_read_u8 v5, v1 offset:12
+; CHECK-NEXT: ds_read_u8 v6, v1 offset:11
+; CHECK-NEXT: ds_read_u8 v7, v1 offset:8
+; CHECK-NEXT: ds_read_u8 v8, v1 offset:9
+; CHECK-NEXT: ds_read_u8 v9, v1 offset:10
+; CHECK-NEXT: ds_read_u8 v10, v1
+; CHECK-NEXT: ds_read_u8 v11, v1 offset:1
+; CHECK-NEXT: ds_read_u8 v12, v1 offset:2
+; CHECK-NEXT: ds_read_u8 v13, v1 offset:3
+; CHECK-NEXT: ds_read_u8 v14, v1 offset:4
+; CHECK-NEXT: ds_read_u8 v15, v1 offset:5
+; CHECK-NEXT: ds_read_u8 v16, v1 offset:6
+; CHECK-NEXT: ds_read_u8 v17, v1 offset:7
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(14)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(11)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: ds_read_u8 v2, v1 offset:24
+; CHECK-NEXT: ds_read_u8 v3, v1 offset:25
+; CHECK-NEXT: ds_read_u8 v4, v1 offset:26
+; CHECK-NEXT: ds_read_u8 v5, v1 offset:27
+; CHECK-NEXT: ds_read_u8 v6, v1 offset:28
+; CHECK-NEXT: ds_read_u8 v18, v1 offset:29
+; CHECK-NEXT: ds_read_u8 v19, v1 offset:30
+; CHECK-NEXT: ds_read_u8 v20, v1 offset:31
+; CHECK-NEXT: ds_read_u8 v21, v1 offset:16
+; CHECK-NEXT: ds_read_u8 v22, v1 offset:17
+; CHECK-NEXT: ds_read_u8 v23, v1 offset:18
+; CHECK-NEXT: ds_read_u8 v24, v1 offset:19
+; CHECK-NEXT: ds_read_u8 v25, v1 offset:20
+; CHECK-NEXT: ds_read_u8 v26, v1 offset:21
+; CHECK-NEXT: ds_read_u8 v27, v1 offset:22
+; CHECK-NEXT: ds_read_u8 v1, v1 offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(24)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(16)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:31
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p3_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u16 v2, v1
+; CHECK-NEXT: ds_read_u16 v3, v1 offset:2
+; CHECK-NEXT: ds_read_u16 v4, v1 offset:4
+; CHECK-NEXT: ds_read_u16 v5, v1 offset:6
+; CHECK-NEXT: ds_read_u16 v6, v1 offset:8
+; CHECK-NEXT: ds_read_u16 v7, v1 offset:10
+; CHECK-NEXT: ds_read_u16 v8, v1 offset:12
+; CHECK-NEXT: ds_read_u16 v1, v1 offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt lgkmcnt(6)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(5)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v2, v1 offset:30
+; CHECK-NEXT: ds_read_u16 v3, v1 offset:28
+; CHECK-NEXT: ds_read_u16 v4, v1 offset:26
+; CHECK-NEXT: ds_read_u16 v5, v1 offset:24
+; CHECK-NEXT: ds_read_u16 v6, v1 offset:22
+; CHECK-NEXT: ds_read_u16 v7, v1 offset:20
+; CHECK-NEXT: ds_read_u16 v8, v1 offset:18
+; CHECK-NEXT: ds_read_u16 v9, v1 offset:16
+; CHECK-NEXT: ds_read_u16 v10, v1 offset:14
+; CHECK-NEXT: ds_read_u16 v11, v1 offset:12
+; CHECK-NEXT: ds_read_u16 v12, v1 offset:10
+; CHECK-NEXT: ds_read_u16 v13, v1 offset:8
+; CHECK-NEXT: ds_read_u16 v14, v1 offset:6
+; CHECK-NEXT: ds_read_u16 v15, v1 offset:4
+; CHECK-NEXT: ds_read_u16 v16, v1 offset:2
+; CHECK-NEXT: ds_read_u16 v1, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(14)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(11)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt lgkmcnt(10)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt lgkmcnt(9)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt lgkmcnt(8)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(6)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(5)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p3_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u16 v2, v1 offset:30
+; CHECK-NEXT: ds_read_u16 v3, v1 offset:28
+; CHECK-NEXT: ds_read_u16 v4, v1 offset:26
+; CHECK-NEXT: ds_read_u16 v5, v1 offset:24
+; CHECK-NEXT: ds_read_u16 v6, v1 offset:22
+; CHECK-NEXT: ds_read_u16 v7, v1 offset:20
+; CHECK-NEXT: ds_read_u16 v8, v1 offset:18
+; CHECK-NEXT: ds_read_u16 v9, v1 offset:16
+; CHECK-NEXT: ds_read_u16 v10, v1 offset:14
+; CHECK-NEXT: ds_read_u16 v11, v1 offset:12
+; CHECK-NEXT: ds_read_u16 v12, v1 offset:10
+; CHECK-NEXT: ds_read_u16 v13, v1 offset:8
+; CHECK-NEXT: ds_read_u16 v14, v1 offset:6
+; CHECK-NEXT: ds_read_u16 v15, v1 offset:4
+; CHECK-NEXT: ds_read_u16 v16, v1 offset:2
+; CHECK-NEXT: ds_read_u16 v1, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(14)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(11)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt lgkmcnt(10)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt lgkmcnt(9)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt lgkmcnt(8)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(6)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(5)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p3_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p3_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p3_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p3_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT: ds_read_b128 v[6:9], v1 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v6
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v6
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v9
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p3_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p3_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p3_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p3_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[1:4], v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p3_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p3_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[2:5], v1
+; CHECK-NEXT: ds_read_b128 v[6:9], v1 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v6
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v6
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v9
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p3_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p3_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[2:5], v1
+; CHECK-NEXT: ds_read_b128 v[6:9], v1 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p4_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15
+; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13
+; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11
+; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9
+; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7
+; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5
+; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3
+; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p4_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off
+; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:1
+; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:3
+; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:5
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:7
+; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:9
+; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:11
+; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:13
+; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:29
+; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:27
+; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:25
+; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:23
+; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:21
+; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:19
+; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:17
+; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:16
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p4_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15
+; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13
+; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11
+; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9
+; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7
+; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5
+; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3
+; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1
+; CHECK-NEXT: global_load_ubyte v18, v[1:2], off
+; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:31
+; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:29
+; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:27
+; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:25
+; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:23
+; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:21
+; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:19
+; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:17
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p4_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: global_load_ushort v3, v[1:2], off
+; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ushort v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p4_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ushort v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p4_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p4_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p4_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p4_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10
+; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p4_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p4_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p4_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p4_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p4_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p4_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8
+; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9
+; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10
+; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p4_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p4_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p5_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x11
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_clause 0xc
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p5_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x11
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_clause 0xd
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p5_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p5_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p5_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p5_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p5_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p5_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x13
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p5_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p5_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p5_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p5_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p5_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p5_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x13
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p5_p5_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p5_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p0.p1.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p0.p3.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p0.p4.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p0.p5.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p3.p5.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll b/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll
new file mode 100644
index 0000000000000..f60728c16a3ae
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-scalar-load.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s
+
+; Testing codegen for memcpy with scalar reads.
+
+
+define void @memcpy_p1_p4_sz16_align_4_4(ptr addrspace(1) align 4 %dst, ptr addrspace(4) align 4 readonly inreg %src) {
+; CHECK-LABEL: memcpy_p1_p4_sz16_align_4_4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: v_mov_b32_e32 v3, s5
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 4 %dst, ptr addrspace(4) noundef nonnull align 4 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr addrspace(4) align 4 readonly inreg %src) {
+; CHECK-LABEL: memcpy_p1_p4_sz31_align_4_4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, s8
+; CHECK-NEXT: v_mov_b32_e32 v3, s9
+; CHECK-NEXT: v_mov_b32_e32 v4, s10
+; CHECK-NEXT: v_mov_b32_e32 v5, s11
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v6, s[4:5] offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 4 %dst, ptr addrspace(4) noundef nonnull align 4 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memcpy_p1_p4_sz32_align_4_4(ptr addrspace(1) align 4 %dst, ptr addrspace(4) align 4 readonly inreg %src) {
+; CHECK-LABEL: memcpy_p1_p4_sz32_align_4_4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: v_mov_b32_e32 v3, s5
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: v_mov_b32_e32 v6, s8
+; CHECK-NEXT: v_mov_b32_e32 v7, s9
+; CHECK-NEXT: v_mov_b32_e32 v8, s10
+; CHECK-NEXT: v_mov_b32_e32 v9, s11
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef nonnull align 4 %dst, ptr addrspace(4) noundef nonnull align 4 %src, i64 32, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
new file mode 100644
index 0000000000000..cc5256620bfe0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
@@ -0,0 +1,8698 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s
+
+; Testing codegen for memmove with vector operands for all combinations of the following parameters:
+; destination address space: 0, 1, 3, 5
+; source address space: 0, 1, 3, 4, 5
+; alignment: 1, 2, 8, 16
+; sizes: 16, 31, 32
+
+
+define void @memmove_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p0_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1
+; CHECK-NEXT: flat_load_ubyte v2, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p0_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:27
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:21
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:19
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:17
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:13
+; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:11
+; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:9
+; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:7
+; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:5
+; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:3
+; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:1
+; CHECK-NEXT: flat_load_ubyte v2, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p0_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:31
+; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:29
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:27
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:25
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:23
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:21
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:19
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:17
+; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:13
+; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11
+; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9
+; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7
+; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5
+; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:3
+; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:1
+; CHECK-NEXT: flat_load_ubyte v2, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(31) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+; 16-byte memmove from a flat (addrspace 0) source to a flat (addrspace 0)
+; destination, both pointers 2-byte aligned. Expected code: one s_clause
+; followed by eight flat_load_ushort (offsets 14 down to 0), then eight
+; flat_store_short, each preceded by an s_waitcnt whose vmcnt steps from 7
+; down to 0.
+define void @memmove_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p0_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ushort v2, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+; 31-byte memmove from a flat (addrspace 0) source to a flat (addrspace 0)
+; destination, both pointers 2-byte aligned. Expected code: one s_clause
+; followed by one flat_load_ubyte (offset 30) and fifteen flat_load_ushort
+; (offsets 28 down to 0), then the matching byte/short stores, each preceded
+; by an s_waitcnt whose vmcnt steps from 15 down to 0.
+define void @memmove_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p0_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ushort v2, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+; 32-byte memmove from a flat (addrspace 0) source to a flat (addrspace 0)
+; destination, both pointers 2-byte aligned. Expected code: one s_clause
+; followed by sixteen flat_load_ushort (offsets 30 down to 0), then sixteen
+; flat_store_short, each preceded by an s_waitcnt whose vmcnt steps from 15
+; down to 0.
+define void @memmove_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p0_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ushort v2, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+; 16-byte memmove from a flat (addrspace 0) source to a flat (addrspace 0)
+; destination, both pointers 8-byte aligned. Expected code: a single
+; flat_load_dwordx4 followed by a single flat_store_dwordx4.
+define void @memmove_p0_p0_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p0_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+; 31-byte memmove from a flat (addrspace 0) source to a flat (addrspace 0)
+; destination, both pointers 8-byte aligned. Expected code: one s_clause
+; followed by five loads of mixed width (dword at 16, dwordx4 at 0, ushort at
+; 28, dword at 24, ubyte at 30), a further dword load at offset 20 issued
+; after the first store, and stores of dword/byte/short/dwordx2/dwordx4.
+; NOTE(review): the store to dst+16 is expected before the load from src+20.
+; With both pointers 8-byte aligned those two 4-byte accesses presumably
+; cannot overlap, which would make the reordering safe - confirm.
+define void @memmove_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p0_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x4
+; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
+; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:20
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+; 32-byte memmove from a flat (addrspace 0) source to a flat (addrspace 0)
+; destination, both pointers 8-byte aligned. Expected code: one s_clause
+; followed by two flat_load_dwordx4 (offsets 16 and 0), then two
+; flat_store_dwordx4, each preceded by its own s_waitcnt.
+define void @memmove_p0_p0_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p0_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+; 16-byte memmove from a flat (addrspace 0) source to a flat (addrspace 0)
+; destination, both pointers 16-byte aligned. Expected code: a single
+; flat_load_dwordx4 followed by a single flat_store_dwordx4.
+define void @memmove_p0_p0_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p0_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+; 31-byte memmove from a flat (addrspace 0) source to a flat (addrspace 0)
+; destination, both pointers 16-byte aligned. Expected code: one s_clause
+; followed by four loads of mixed width (ubyte at 30, dword at 16, ushort at
+; 28, dwordx4 at 0); the dword loads at offsets 20 and 24 are each issued
+; after a preceding dword store, and the remaining stores follow.
+; NOTE(review): stores to dst+16 and dst+20 are expected before the loads
+; from src+20 and src+24. With both pointers 16-byte aligned those 4-byte
+; accesses presumably cannot overlap, which would make the interleaving
+; safe - confirm.
+define void @memmove_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p0_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:30
+; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16
+; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:20
+; CHECK-NEXT: flat_load_dword v2, v[2:3] offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dword v[0:1], v2 offset:24
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+; 32-byte memmove from a flat (addrspace 0) source to a flat (addrspace 0)
+; destination, both pointers 16-byte aligned. Expected code: one s_clause
+; followed by two flat_load_dwordx4 (offsets 16 and 0), then two
+; flat_store_dwordx4, each preceded by its own s_waitcnt.
+define void @memmove_p0_p0_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p0_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+; 16-byte memmove from an addrspace(1) source to a flat (addrspace 0)
+; destination, both pointers 1-byte aligned. Expected code: one s_clause
+; followed by sixteen global_load_ubyte (offsets 15 down to 0), then sixteen
+; flat_store_byte, each preceded by an s_waitcnt whose vmcnt steps from 15
+; down to 0.
+define void @memmove_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p1_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1
+; CHECK-NEXT: global_load_ubyte v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+; 31-byte memmove from an addrspace(1) source to a flat (addrspace 0)
+; destination, both pointers 1-byte aligned. Expected code: one s_clause
+; followed by thirty-one global_load_ubyte (offsets 30 down to 0), then
+; thirty-one flat_store_byte, each preceded by an s_waitcnt whose vmcnt steps
+; from 30 down to 0.
+define void @memmove_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p1_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:15
+; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:13
+; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:11
+; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:9
+; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:7
+; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:5
+; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:3
+; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:1
+; CHECK-NEXT: global_load_ubyte v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+; 32-byte memmove from an addrspace(1) source to a flat (addrspace 0)
+; destination, both pointers 1-byte aligned. Expected code: one s_clause
+; followed by thirty-two global_load_ubyte (offsets 31 down to 0), then
+; thirty-two flat_store_byte, each preceded by an s_waitcnt whose vmcnt steps
+; from 31 down to 0.
+define void @memmove_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p1_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17
+; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:15
+; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:13
+; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:11
+; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:9
+; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:7
+; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:5
+; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:3
+; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ubyte v34, v[2:3], off offset:1
+; CHECK-NEXT: global_load_ubyte v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+; 16-byte memmove from an addrspace(1) source to a flat (addrspace 0)
+; destination, both pointers 2-byte aligned. Expected code: one s_clause
+; followed by eight global_load_ushort (offsets 14 down to 0), then eight
+; flat_store_short, each preceded by an s_waitcnt whose vmcnt steps from 7
+; down to 0.
+define void @memmove_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p1_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ushort v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+; 31-byte memmove from an addrspace(1) source to a flat (addrspace 0)
+; destination, both pointers 2-byte aligned. Expected code: one s_clause
+; followed by one global_load_ubyte (offset 30) and fifteen
+; global_load_ushort (offsets 28 down to 0), then the matching flat
+; byte/short stores, each preceded by an s_waitcnt whose vmcnt steps from 15
+; down to 0.
+define void @memmove_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p1_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ushort v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+; 32-byte memmove from an addrspace(1) source to a flat (addrspace 0)
+; destination, both pointers 2-byte aligned. Expected code: one s_clause
+; followed by sixteen global_load_ushort (offsets 30 down to 0), then sixteen
+; flat_store_short, each preceded by an s_waitcnt whose vmcnt steps from 15
+; down to 0.
+define void @memmove_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p1_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ushort v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p1_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p1_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p1_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x4
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:20
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p1_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p1_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p1_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p1_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p1_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
+; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16
+; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:20
+; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2 offset:24
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p1_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p1_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p3_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:15
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:14
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:13
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:12
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:11
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:10
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:9
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:8
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:7
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:6
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:5
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:4
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:3
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:2
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:1
+; CHECK-NEXT: ds_read_u8 v2, v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p3_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:24
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:25
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:26
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:27
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:29
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:16
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:17
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:18
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:19
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:20
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:21
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:22
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:23
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:8
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:9
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:10
+; CHECK-NEXT: ds_read_u8 v21, v2 offset:11
+; CHECK-NEXT: ds_read_u8 v22, v2 offset:12
+; CHECK-NEXT: ds_read_u8 v23, v2 offset:13
+; CHECK-NEXT: ds_read_u8 v24, v2 offset:14
+; CHECK-NEXT: ds_read_u8 v25, v2 offset:15
+; CHECK-NEXT: ds_read_u8 v26, v2
+; CHECK-NEXT: ds_read_u8 v27, v2 offset:1
+; CHECK-NEXT: ds_read_u8 v28, v2 offset:2
+; CHECK-NEXT: ds_read_u8 v29, v2 offset:3
+; CHECK-NEXT: ds_read_u8 v30, v2 offset:4
+; CHECK-NEXT: ds_read_u8 v31, v2 offset:5
+; CHECK-NEXT: ds_read_u8 v32, v2 offset:6
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:7
+; CHECK-NEXT: s_waitcnt lgkmcnt(24)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(23)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:23
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(23)
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:15
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:9
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(23)
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7
+; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:6
+; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:5
+; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4
+; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:3
+; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:2
+; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:1
+; CHECK-NEXT: flat_store_byte v[0:1], v26
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p3_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:24
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:25
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:26
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:27
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:29
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:31
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:16
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:17
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:18
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:19
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:20
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:21
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:22
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:23
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:8
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:9
+; CHECK-NEXT: ds_read_u8 v21, v2 offset:10
+; CHECK-NEXT: ds_read_u8 v22, v2 offset:11
+; CHECK-NEXT: ds_read_u8 v23, v2 offset:12
+; CHECK-NEXT: ds_read_u8 v24, v2 offset:13
+; CHECK-NEXT: ds_read_u8 v25, v2 offset:14
+; CHECK-NEXT: ds_read_u8 v26, v2 offset:15
+; CHECK-NEXT: ds_read_u8 v27, v2
+; CHECK-NEXT: ds_read_u8 v28, v2 offset:1
+; CHECK-NEXT: ds_read_u8 v29, v2 offset:2
+; CHECK-NEXT: ds_read_u8 v30, v2 offset:3
+; CHECK-NEXT: ds_read_u8 v31, v2 offset:4
+; CHECK-NEXT: ds_read_u8 v32, v2 offset:5
+; CHECK-NEXT: ds_read_u8 v33, v2 offset:6
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:7
+; CHECK-NEXT: s_waitcnt lgkmcnt(24)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:31
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(24)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:23
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(24)
+; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:15
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:9
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(24)
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7
+; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:6
+; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:5
+; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4
+; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:3
+; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:2
+; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:1
+; CHECK-NEXT: flat_store_byte v[0:1], v27
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p3_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u16 v3, v2 offset:14
+; CHECK-NEXT: ds_read_u16 v4, v2 offset:12
+; CHECK-NEXT: ds_read_u16 v5, v2 offset:10
+; CHECK-NEXT: ds_read_u16 v6, v2 offset:8
+; CHECK-NEXT: ds_read_u16 v7, v2 offset:6
+; CHECK-NEXT: ds_read_u16 v8, v2 offset:4
+; CHECK-NEXT: ds_read_u16 v9, v2 offset:2
+; CHECK-NEXT: ds_read_u16 v2, v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p3_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:30
+; CHECK-NEXT: ds_read_u16 v4, v2 offset:28
+; CHECK-NEXT: ds_read_u16 v5, v2 offset:26
+; CHECK-NEXT: ds_read_u16 v6, v2 offset:24
+; CHECK-NEXT: ds_read_u16 v7, v2 offset:22
+; CHECK-NEXT: ds_read_u16 v8, v2 offset:20
+; CHECK-NEXT: ds_read_u16 v9, v2 offset:18
+; CHECK-NEXT: ds_read_u16 v10, v2 offset:16
+; CHECK-NEXT: ds_read_u16 v11, v2 offset:14
+; CHECK-NEXT: ds_read_u16 v12, v2 offset:12
+; CHECK-NEXT: ds_read_u16 v13, v2 offset:10
+; CHECK-NEXT: ds_read_u16 v14, v2 offset:8
+; CHECK-NEXT: ds_read_u16 v15, v2 offset:6
+; CHECK-NEXT: ds_read_u16 v16, v2 offset:4
+; CHECK-NEXT: ds_read_u16 v17, v2 offset:2
+; CHECK-NEXT: ds_read_u16 v2, v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p3_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u16 v3, v2 offset:30
+; CHECK-NEXT: ds_read_u16 v4, v2 offset:28
+; CHECK-NEXT: ds_read_u16 v5, v2 offset:26
+; CHECK-NEXT: ds_read_u16 v6, v2 offset:24
+; CHECK-NEXT: ds_read_u16 v7, v2 offset:22
+; CHECK-NEXT: ds_read_u16 v8, v2 offset:20
+; CHECK-NEXT: ds_read_u16 v9, v2 offset:18
+; CHECK-NEXT: ds_read_u16 v10, v2 offset:16
+; CHECK-NEXT: ds_read_u16 v11, v2 offset:14
+; CHECK-NEXT: ds_read_u16 v12, v2 offset:12
+; CHECK-NEXT: ds_read_u16 v13, v2 offset:10
+; CHECK-NEXT: ds_read_u16 v14, v2 offset:8
+; CHECK-NEXT: ds_read_u16 v15, v2 offset:6
+; CHECK-NEXT: ds_read_u16 v16, v2 offset:4
+; CHECK-NEXT: ds_read_u16 v17, v2 offset:2
+; CHECK-NEXT: ds_read_u16 v2, v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p3_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p3_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p3_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p3_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b32 v[6:7], v2 offset0:4 offset1:5
+; CHECK-NEXT: ds_read_b32 v8, v2 offset:24
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
+; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p3_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p3_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
+; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p3_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p3_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p3_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p3_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b32 v[6:7], v2 offset0:4 offset1:5
+; CHECK-NEXT: ds_read_b32 v8, v2 offset:24
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
+; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p3_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p3_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p3.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p4_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1
+; CHECK-NEXT: global_load_ubyte v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p4_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:15
+; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:13
+; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:11
+; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:9
+; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:7
+; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:5
+; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:3
+; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:1
+; CHECK-NEXT: global_load_ubyte v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p4_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31
+; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29
+; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23
+; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21
+; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19
+; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17
+; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:15
+; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:13
+; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:11
+; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:9
+; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:7
+; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:5
+; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:3
+; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ubyte v34, v[2:3], off offset:1
+; CHECK-NEXT: global_load_ubyte v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p4_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ushort v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p4_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ushort v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p4_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28
+; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26
+; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22
+; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18
+; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14
+; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12
+; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10
+; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8
+; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6
+; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4
+; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2
+; CHECK-NEXT: global_load_ushort v2, v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p4_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p4_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p4_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x4
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_dword v[0:1], v8 offset:16
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:20
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p4_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p4_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p4_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p4_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p4_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
+; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:16
+; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v9 offset:20
+; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2 offset:24
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p4_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p4_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p4.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p5_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p5_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p0_p5_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p5_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p5_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p0_p5_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_short v[0:1], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p5_sz16_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p5_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p5_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x8
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p5_sz32_align_8_8(ptr addrspace(0) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p0_p5_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p5_sz16_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p5_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p5_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x8
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p5_sz32_align_16_16(ptr addrspace(0) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p0_p5_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p5.i64(ptr addrspace(0) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p0_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:5
+; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:7
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3]
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:1
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:3
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:9
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:11
+; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10
+; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5
+; CHECK-NEXT: v_lshl_or_b32 v5, v8, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v6, v11, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v8, v12, 8, v13
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15
+; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v10, v18, 8, v2
+; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5
+; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7
+; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p0_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:23
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:25
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19
+; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:17
+; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:11
+; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:9
+; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:7
+; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:5
+; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:1
+; CHECK-NEXT: flat_load_ubyte v31, v[2:3]
+; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:3
+; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29)
+; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v5
+; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25)
+; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23)
+; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11
+; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22)
+; CHECK-NEXT: v_lshlrev_b16 v12, 8, v12
+; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19)
+; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15
+; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17)
+; CHECK-NEXT: v_lshl_or_b32 v3, v16, 8, v17
+; CHECK-NEXT: v_lshl_or_b32 v16, v6, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: v_lshl_or_b32 v15, v18, 8, v19
+; CHECK-NEXT: v_lshl_or_b32 v7, v9, 16, v10
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21
+; CHECK-NEXT: v_lshl_or_b32 v8, v14, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: v_lshl_or_b32 v18, v22, 8, v23
+; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v16
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: v_lshl_or_b32 v17, v24, 8, v25
+; CHECK-NEXT: v_lshl_or_b32 v6, v15, 16, v11
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: v_lshl_or_b32 v20, v26, 8, v27
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: v_lshl_or_b32 v19, v28, 8, v29
+; CHECK-NEXT: v_lshl_or_b32 v4, v18, 16, v17
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: v_lshl_or_b32 v21, v30, 8, v31
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: v_lshl_or_b32 v22, v32, 8, v33
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_or_b32_e32 v12, v12, v2
+; CHECK-NEXT: v_lshl_or_b32 v3, v20, 16, v19
+; CHECK-NEXT: v_lshl_or_b32 v2, v22, 16, v21
+; CHECK-NEXT: global_store_byte v[0:1], v13, off offset:30
+; CHECK-NEXT: global_store_short v[0:1], v12, off offset:28
+; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p0_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:29
+; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:31
+; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25
+; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:27
+; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15
+; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13
+; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23
+; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:21
+; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:19
+; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:17
+; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11
+; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9
+; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7
+; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5
+; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:1
+; CHECK-NEXT: flat_load_ubyte v33, v[2:3]
+; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:3
+; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25)
+; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10
+; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5
+; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24)
+; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22)
+; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13
+; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20)
+; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15
+; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17
+; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v8, v18, 8, v19
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v10, v22, 8, v23
+; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v15, v24, 8, v25
+; CHECK-NEXT: v_lshl_or_b32 v9, v12, 16, v14
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v13, v26, 8, v27
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v17, v28, 8, v29
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31
+; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v13
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v33
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v19, v34, 8, v2
+; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10
+; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16
+; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p0_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ushort v5, v[2:3]
+; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v5
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v7
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p0_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ushort v19, v[2:3]
+; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: v_lshl_or_b32 v8, v6, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: v_lshl_or_b32 v7, v12, 16, v13
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: v_lshl_or_b32 v3, v16, 16, v17
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: v_lshl_or_b32 v2, v18, 16, v19
+; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_byte v[0:1], v20, off offset:30
+; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p0_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30
+; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28
+; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26
+; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:14
+; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:12
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:10
+; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:8
+; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22
+; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20
+; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18
+; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6
+; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4
+; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2
+; CHECK-NEXT: flat_load_ushort v19, v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v4, v6, 16, v9
+; CHECK-NEXT: v_lshl_or_b32 v9, v7, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v3, v12, 16, v13
+; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v11
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v2, v14, 16, v15
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v7, v16, 16, v17
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v6, v18, 16, v19
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p0_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p0_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p0_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x4
+; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:24
+; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
+; CHECK-NEXT: flat_load_dword v8, v[2:3] offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:20
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p0_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p0_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p0_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p0_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p0_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:30
+; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:16
+; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16
+; CHECK-NEXT: flat_load_dword v9, v[2:3] offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:20
+; CHECK-NEXT: flat_load_dword v2, v[2:3] offset:24
+; CHECK-NEXT: global_store_byte v[0:1], v8, off offset:30
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:24
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p0_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p0_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p1_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p1_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p1_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p1_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p1_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p1_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p1_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p1_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p1_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p1_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p1_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p1_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p1_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p1_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p1_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p1_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x4
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:20
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p1_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p1_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p1_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p1_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p1_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p1_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
+; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16
+; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:20
+; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
+; CHECK-NEXT: global_store_byte v[0:1], v8, off offset:30
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:24
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p1_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p1_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p3_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p3_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p3_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p3_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b64 v[7:8], v2
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:8
+; CHECK-NEXT: ds_read_b32 v9, v2 offset:24
+; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_byte v[0:1], v2, off offset:30
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p3_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p3_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p3_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p3_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p3_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p3_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b64 v[7:8], v2
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:8
+; CHECK-NEXT: ds_read_b32 v9, v2 offset:24
+; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_byte v[0:1], v2, off offset:30
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p3_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p3_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p3_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p3_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p3_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p3_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset1:1
+; CHECK-NEXT: ds_read_b32 v7, v2 offset:16
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:30
+; CHECK-NEXT: ds_read2_b32 v[8:9], v2 offset0:5 offset1:6
+; CHECK-NEXT: ds_read_u16 v2, v2 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_short v[0:1], v2, off offset:28
+; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p3_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p3_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset1:1
+; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p3_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p3_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p3_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p3_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b32 v7, v2 offset:16
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:30
+; CHECK-NEXT: ds_read2_b32 v[8:9], v2 offset0:5 offset1:6
+; CHECK-NEXT: ds_read_u16 v2, v2 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_short v[0:1], v2, off offset:28
+; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p3_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p3_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p4_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p4_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p4_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p4_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p4_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p4_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p4_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p4_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p4_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p4_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p4_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p4_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p4_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p4_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p4_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p4_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x4
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:24
+; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: global_store_dword v[0:1], v8, off offset:16
+; CHECK-NEXT: global_load_dword v8, v[2:3], off offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:20
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p4_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p4_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p4_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p4_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p4_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p4_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:30
+; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:16
+; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:16
+; CHECK-NEXT: global_load_dword v9, v[2:3], off offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dword v[0:1], v9, off offset:20
+; CHECK-NEXT: global_load_dword v2, v[2:3], off offset:24
+; CHECK-NEXT: global_store_byte v[0:1], v8, off offset:30
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:24
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p4_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p4_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p5_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v6, v12, 8, v11
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13
+; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v11, v2, 8, v17
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4
+; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6
+; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p5_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: v_lshl_or_b32 v2, v4, 8, v3
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: v_lshl_or_b32 v3, v6, 8, v5
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: v_lshlrev_b16 v4, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10
+; CHECK-NEXT: v_lshl_or_b32 v8, v3, 16, v2
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: v_lshl_or_b32 v5, v13, 8, v12
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v6, v15, 8, v14
+; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v16
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v13, v19, 8, v18
+; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v5
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v10, v21, 8, v20
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v11, v23, 8, v22
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: v_lshl_or_b32 v15, v26, 8, v25
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: v_lshl_or_b32 v14, v24, 8, v28
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v27
+; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: v_or_b32_e32 v18, v4, v32
+; CHECK-NEXT: v_lshl_or_b32 v4, v13, 16, v12
+; CHECK-NEXT: v_lshl_or_b32 v7, v15, 16, v14
+; CHECK-NEXT: v_lshl_or_b32 v6, v16, 16, v17
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_byte v[0:1], v33, off offset:30
+; CHECK-NEXT: global_store_short v[0:1], v18, off offset:28
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p1_p5_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13
+; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29
+; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v31
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v33
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4
+; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6
+; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10
+; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15
+; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16
+; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p5_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p5_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v8, v4, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v7, v15, 16, v14
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v6, v17, 16, v16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: global_store_short v[0:1], v18, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_byte v[0:1], v19, off offset:30
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p1_p5_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v9, v16, 16, v15
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v8, v18, 16, v17
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p5_sz16_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p5_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p5_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x8
+; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p5_sz32_align_8_8(ptr addrspace(1) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p1_p5_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p5_sz16_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p5_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p5_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x8
+; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p5_sz32_align_16_16(ptr addrspace(1) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p1_p5_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p0_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:5
+; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:7
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2]
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:1
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:3
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:9
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:11
+; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9
+; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4
+; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v12
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v6, v13, 8, v14
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v8, v15, 8, v16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v1
+; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4
+; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6
+; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p0_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:23
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:25
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19
+; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:17
+; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:11
+; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:9
+; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:7
+; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:5
+; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:1
+; CHECK-NEXT: flat_load_ubyte v30, v[1:2]
+; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:3
+; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29)
+; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v4
+; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27)
+; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v6
+; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25)
+; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23)
+; CHECK-NEXT: v_lshl_or_b32 v9, v9, 8, v10
+; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22)
+; CHECK-NEXT: v_lshlrev_b16 v11, 8, v11
+; CHECK-NEXT: v_lshl_or_b32 v4, v3, 16, v5
+; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19)
+; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14
+; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17)
+; CHECK-NEXT: v_lshl_or_b32 v2, v15, 8, v16
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: v_lshl_or_b32 v10, v17, 8, v18
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: v_lshl_or_b32 v8, v19, 8, v20
+; CHECK-NEXT: v_lshl_or_b32 v13, v13, 16, v2
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: v_lshl_or_b32 v14, v21, 8, v22
+; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v9
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: v_lshl_or_b32 v6, v23, 8, v24
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: v_lshl_or_b32 v16, v25, 8, v26
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28
+; CHECK-NEXT: v_lshl_or_b32 v3, v14, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: v_lshl_or_b32 v17, v29, 8, v30
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v32
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_or_b32_e32 v11, v11, v1
+; CHECK-NEXT: v_lshl_or_b32 v1, v10, 16, v8
+; CHECK-NEXT: v_lshl_or_b32 v6, v16, 16, v15
+; CHECK-NEXT: v_lshl_or_b32 v5, v18, 16, v17
+; CHECK-NEXT: ds_write_b8 v0, v12 offset:30
+; CHECK-NEXT: ds_write_b32 v0, v13 offset:24
+; CHECK-NEXT: ds_write_b16 v0, v11 offset:28
+; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16
+; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p0_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:29
+; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:31
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:27
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:21
+; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:19
+; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:17
+; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11
+; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9
+; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7
+; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5
+; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:1
+; CHECK-NEXT: flat_load_ubyte v32, v[1:2]
+; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:3
+; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25)
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9
+; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4
+; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6
+; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24)
+; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22)
+; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12
+; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20)
+; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v16
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v6, v17, 8, v18
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v9, v19, 8, v20
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v8, v21, 8, v22
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v13, v23, 8, v24
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26
+; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v14, v29, 8, v30
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v16, v31, 8, v32
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v17, v33, 8, v1
+; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4
+; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6
+; CHECK-NEXT: v_lshl_or_b32 v6, v11, 16, v10
+; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12
+; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14
+; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p0_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ushort v4, v[1:2]
+; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v1, v8, 16, v4
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v5
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v6
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p0_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ushort v17, v[1:2]
+; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v4, v3, 16, v4
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v3, v5, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v9
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: v_lshl_or_b32 v2, v10, 16, v11
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: v_lshl_or_b32 v1, v12, 16, v13
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: v_lshl_or_b32 v5, v16, 16, v17
+; CHECK-NEXT: ds_write_b16 v0, v7 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT: ds_write_b8 v0, v18 offset:30
+; CHECK-NEXT: ds_write_b32 v0, v8 offset:24
+; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16
+; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p0_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ushort v18, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v4
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v7
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v10
+; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v9
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v3, v13, 16, v14
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v18
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p0_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p0_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p0_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p0_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2]
+; CHECK-NEXT: flat_load_dwordx3 v[7:9], v[1:2] offset:16
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ushort v1, v[1:2] offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
+; CHECK-NEXT: ds_write2_b32 v0, v7, v8 offset0:4 offset1:5
+; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(4)
+; CHECK-NEXT: ds_write_b8 v0, v10 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(4)
+; CHECK-NEXT: ds_write_b16 v0, v1 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p0_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p0_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p0_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p0_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[1:4]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p0_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:30
+; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: ds_write_b32 v0, v8 offset:16
+; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_write_b32 v0, v8 offset:20
+; CHECK-NEXT: flat_load_dword v1, v[1:2] offset:24
+; CHECK-NEXT: ds_write_b8 v0, v7 offset:30
+; CHECK-NEXT: ds_write_b16 v0, v9 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT: ds_write_b32 v0, v1 offset:24
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p0_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p0_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[3:6] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[7:10]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p0.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p1_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p1_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p1_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p1_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p1_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p1_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p1_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p1_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p1_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p1_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p1_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p1_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p1_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p1_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p1_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p1_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
+; CHECK-NEXT: ds_write_b32 v0, v7 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p1_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p1_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p1_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p1_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[1:4]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p1_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p1_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
+; CHECK-NEXT: ds_write_b32 v0, v7 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p1_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p1_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p1.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p3_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p3_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p3_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p3_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v7, v1 offset:30
+; CHECK-NEXT: ds_read_u16 v8, v1 offset:28
+; CHECK-NEXT: ds_read_b32 v9, v1 offset:24
+; CHECK-NEXT: ds_read_b64 v[5:6], v1 offset:16
+; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b8 v0, v7 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b16 v0, v8 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p3_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p3_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset0:2 offset1:3
+; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p3_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p3_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p3_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p3_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v7, v1 offset:30
+; CHECK-NEXT: ds_read_u16 v8, v1 offset:28
+; CHECK-NEXT: ds_read_b32 v9, v1 offset:24
+; CHECK-NEXT: ds_read_b64 v[5:6], v1 offset:16
+; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b8 v0, v7 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b16 v0, v8 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p3_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p3_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset0:2 offset1:3
+; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p3_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p3_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p3_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p3_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT: ds_read2_b32 v[6:7], v1 offset0:4 offset1:5
+; CHECK-NEXT: ds_read_b32 v8, v1 offset:24
+; CHECK-NEXT: ds_read_u8 v9, v1 offset:30
+; CHECK-NEXT: ds_read_u16 v1, v1 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write2_b32 v0, v6, v7 offset0:4 offset1:5
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b32 v0, v8 offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b8 v0, v9 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b16 v0, v1 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p3_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p3_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset0:2 offset1:3
+; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p3_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p3_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[1:4], v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[1:4]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p3_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p3_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b32 v[5:6], v1 offset0:4 offset1:5
+; CHECK-NEXT: ds_read_b32 v7, v1 offset:24
+; CHECK-NEXT: ds_read_u8 v8, v1 offset:30
+; CHECK-NEXT: ds_read_u16 v9, v1 offset:28
+; CHECK-NEXT: ds_read_b128 v[1:4], v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write2_b32 v0, v5, v6 offset0:4 offset1:5
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b32 v0, v7 offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b8 v0, v8 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b16 v0, v9 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: ds_write_b128 v0, v[1:4]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p3_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p3_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[2:5], v1 offset:16
+; CHECK-NEXT: ds_read_b128 v[6:9], v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[2:5] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[6:9]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p4_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p4_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p4_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p4_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p4_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p4_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p4_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p4_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p4_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p4_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT: ds_write_b32 v0, v9 offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p4_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p4_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p4_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p4_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p4_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p4_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
+; CHECK-NEXT: ds_write_b32 v0, v7 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p4_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p4_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p4_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p4_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[1:4]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p4_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p4_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_dwordx3 v[7:9], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: ds_write2_b32 v0, v8, v9 offset0:5 offset1:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
+; CHECK-NEXT: ds_write_b32 v0, v7 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b16 v0, v10 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p4_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p4_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b128 v0, v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p4.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p5_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v5, v11, 8, v10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12
+; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v14
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v10, v1, 8, v16
+; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5
+; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p5_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 8, v4
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: v_lshlrev_b16 v3, 8, v6
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: v_lshl_or_b32 v7, v10, 8, v9
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: v_lshl_or_b32 v4, v12, 8, v11
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v5, v14, 8, v13
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15
+; CHECK-NEXT: v_lshl_or_b32 v16, v2, 16, v1
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v17
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v8, v20, 8, v19
+; CHECK-NEXT: v_lshl_or_b32 v1, v7, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v9, v22, 8, v21
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v24
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: v_lshl_or_b32 v12, v23, 8, v27
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v14, v28, 8, v26
+; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v15, v30, 8, v29
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: v_or_b32_e32 v17, v3, v31
+; CHECK-NEXT: v_lshl_or_b32 v3, v11, 16, v10
+; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12
+; CHECK-NEXT: v_lshl_or_b32 v5, v14, 16, v15
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v32 offset:30
+; CHECK-NEXT: ds_write_b32 v0, v16 offset:24
+; CHECK-NEXT: ds_write_b16 v0, v17 offset:28
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p3_p5_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12
+; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28
+; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v32
+; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5
+; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9
+; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14
+; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15
+; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p5_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p5_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v18, v3, 16, v2
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v5, v15, 16, v14
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b16 v0, v16 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v17 offset:30
+; CHECK-NEXT: ds_write_b32 v0, v18 offset:24
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p3_p5_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p5_sz16_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p5_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p5_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p5_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x8
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: ds_write2_b32 v0, v6, v7 offset0:5 offset1:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: ds_write_b32 v0, v8 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write_b16 v0, v9 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b8 v0, v1 offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p5_sz32_align_8_8(ptr addrspace(3) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p3_p5_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p5_sz16_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p5_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p5_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x8
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: ds_write2_b32 v0, v7, v8 offset0:5 offset1:6
+; CHECK-NEXT: ds_write_b32 v0, v6 offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: ds_write_b16 v0, v9 offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: ds_write_b8 v0, v10 offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[2:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p3_p5_sz32_align_16_16(ptr addrspace(3) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p3_p5_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: ds_write_b128 v0, v[2:5]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b128 v0, v[6:9] offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p3.p5.i64(ptr addrspace(3) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p0_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1
+; CHECK-NEXT: flat_load_ubyte v1, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p0_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:27
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:25
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:23
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:21
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:19
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:17
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:13
+; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:11
+; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:9
+; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:7
+; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:5
+; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:3
+; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:1
+; CHECK-NEXT: flat_load_ubyte v1, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(28)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(26)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(21)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p0_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:31
+; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:29
+; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:27
+; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:25
+; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:23
+; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:21
+; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:19
+; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:17
+; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15
+; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:13
+; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11
+; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9
+; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7
+; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5
+; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:3
+; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:1
+; CHECK-NEXT: flat_load_ubyte v1, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(31) lgkmcnt(31)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(28)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(26)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(21)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p0_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ushort v1, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p0_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ushort v1, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(0) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p0_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28
+; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26
+; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22
+; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20
+; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18
+; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14
+; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12
+; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10
+; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8
+; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6
+; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4
+; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2
+; CHECK-NEXT: flat_load_ushort v1, v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p0_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p0_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p0_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p0_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x4
+; CHECK-NEXT: flat_load_dword v7, v[1:2] offset:16
+; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:24
+; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:30
+; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:28
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: flat_load_dword v1, v[1:2] offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p0_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(0) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p0_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p0_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p0_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p0_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:30
+; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:16
+; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: flat_load_dword v8, v[1:2] offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: flat_load_dword v1, v[1:2] offset:24
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p0_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(0) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p0_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p1_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15
+; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13
+; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11
+; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9
+; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7
+; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5
+; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3
+; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p1_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:29
+; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:27
+; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:25
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:23
+; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:21
+; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:19
+; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:17
+; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:16
+; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:15
+; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:13
+; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:11
+; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:9
+; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:7
+; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:5
+; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:3
+; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:1
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p1_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:31
+; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:29
+; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:27
+; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:25
+; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:23
+; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:21
+; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:19
+; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:17
+; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:16
+; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:15
+; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:13
+; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:11
+; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:9
+; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:7
+; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:5
+; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:3
+; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:1
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p1_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: global_load_ushort v3, v[1:2], off
+; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p1_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ushort v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(1) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p1_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ushort v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p1_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p1_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p1_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p1_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p1_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(1) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p1_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p1_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p1_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p1_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p1_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p1_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(1) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p1_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p3_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v2, v1 offset:15
+; CHECK-NEXT: ds_read_u8 v3, v1 offset:14
+; CHECK-NEXT: ds_read_u8 v4, v1 offset:13
+; CHECK-NEXT: ds_read_u8 v5, v1 offset:12
+; CHECK-NEXT: ds_read_u8 v6, v1 offset:11
+; CHECK-NEXT: ds_read_u8 v7, v1 offset:10
+; CHECK-NEXT: ds_read_u8 v8, v1 offset:9
+; CHECK-NEXT: ds_read_u8 v9, v1 offset:8
+; CHECK-NEXT: ds_read_u8 v10, v1 offset:7
+; CHECK-NEXT: ds_read_u8 v11, v1 offset:6
+; CHECK-NEXT: ds_read_u8 v12, v1 offset:5
+; CHECK-NEXT: ds_read_u8 v13, v1 offset:4
+; CHECK-NEXT: ds_read_u8 v14, v1 offset:3
+; CHECK-NEXT: ds_read_u8 v15, v1 offset:2
+; CHECK-NEXT: ds_read_u8 v16, v1 offset:1
+; CHECK-NEXT: ds_read_u8 v1, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt lgkmcnt(14)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(11)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt lgkmcnt(10)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(9)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt lgkmcnt(6)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(5)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p3_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v2, v1 offset:24
+; CHECK-NEXT: ds_read_u8 v3, v1 offset:25
+; CHECK-NEXT: ds_read_u8 v4, v1 offset:26
+; CHECK-NEXT: ds_read_u8 v5, v1 offset:27
+; CHECK-NEXT: ds_read_u8 v6, v1 offset:28
+; CHECK-NEXT: ds_read_u8 v7, v1 offset:29
+; CHECK-NEXT: ds_read_u8 v8, v1 offset:30
+; CHECK-NEXT: ds_read_u8 v9, v1 offset:16
+; CHECK-NEXT: ds_read_u8 v10, v1 offset:17
+; CHECK-NEXT: ds_read_u8 v11, v1 offset:18
+; CHECK-NEXT: ds_read_u8 v12, v1 offset:19
+; CHECK-NEXT: ds_read_u8 v13, v1 offset:20
+; CHECK-NEXT: ds_read_u8 v14, v1 offset:21
+; CHECK-NEXT: ds_read_u8 v15, v1 offset:22
+; CHECK-NEXT: ds_read_u8 v16, v1 offset:23
+; CHECK-NEXT: ds_read_u8 v17, v1 offset:8
+; CHECK-NEXT: ds_read_u8 v18, v1 offset:9
+; CHECK-NEXT: ds_read_u8 v19, v1 offset:10
+; CHECK-NEXT: ds_read_u8 v20, v1 offset:11
+; CHECK-NEXT: ds_read_u8 v21, v1 offset:12
+; CHECK-NEXT: ds_read_u8 v22, v1 offset:13
+; CHECK-NEXT: ds_read_u8 v23, v1 offset:14
+; CHECK-NEXT: ds_read_u8 v24, v1 offset:15
+; CHECK-NEXT: ds_read_u8 v25, v1
+; CHECK-NEXT: ds_read_u8 v26, v1 offset:1
+; CHECK-NEXT: ds_read_u8 v27, v1 offset:2
+; CHECK-NEXT: ds_read_u8 v28, v1 offset:3
+; CHECK-NEXT: ds_read_u8 v29, v1 offset:4
+; CHECK-NEXT: ds_read_u8 v30, v1 offset:5
+; CHECK-NEXT: ds_read_u8 v31, v1 offset:6
+; CHECK-NEXT: ds_read_u8 v1, v1 offset:7
+; CHECK-NEXT: s_waitcnt lgkmcnt(24)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(16)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(3) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p3_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v2, v1 offset:24
+; CHECK-NEXT: ds_read_u8 v3, v1 offset:25
+; CHECK-NEXT: ds_read_u8 v4, v1 offset:26
+; CHECK-NEXT: ds_read_u8 v5, v1 offset:27
+; CHECK-NEXT: ds_read_u8 v6, v1 offset:28
+; CHECK-NEXT: ds_read_u8 v7, v1 offset:29
+; CHECK-NEXT: ds_read_u8 v8, v1 offset:30
+; CHECK-NEXT: ds_read_u8 v9, v1 offset:31
+; CHECK-NEXT: ds_read_u8 v10, v1 offset:16
+; CHECK-NEXT: ds_read_u8 v11, v1 offset:17
+; CHECK-NEXT: ds_read_u8 v12, v1 offset:18
+; CHECK-NEXT: ds_read_u8 v13, v1 offset:19
+; CHECK-NEXT: ds_read_u8 v14, v1 offset:20
+; CHECK-NEXT: ds_read_u8 v15, v1 offset:21
+; CHECK-NEXT: ds_read_u8 v16, v1 offset:22
+; CHECK-NEXT: ds_read_u8 v17, v1 offset:23
+; CHECK-NEXT: ds_read_u8 v18, v1 offset:8
+; CHECK-NEXT: ds_read_u8 v19, v1 offset:9
+; CHECK-NEXT: ds_read_u8 v20, v1 offset:10
+; CHECK-NEXT: ds_read_u8 v21, v1 offset:11
+; CHECK-NEXT: ds_read_u8 v22, v1 offset:12
+; CHECK-NEXT: ds_read_u8 v23, v1 offset:13
+; CHECK-NEXT: ds_read_u8 v24, v1 offset:14
+; CHECK-NEXT: ds_read_u8 v25, v1 offset:15
+; CHECK-NEXT: ds_read_u8 v26, v1
+; CHECK-NEXT: ds_read_u8 v27, v1 offset:1
+; CHECK-NEXT: ds_read_u8 v28, v1 offset:2
+; CHECK-NEXT: ds_read_u8 v29, v1 offset:3
+; CHECK-NEXT: ds_read_u8 v30, v1 offset:4
+; CHECK-NEXT: ds_read_u8 v31, v1 offset:5
+; CHECK-NEXT: ds_read_u8 v32, v1 offset:6
+; CHECK-NEXT: ds_read_u8 v1, v1 offset:7
+; CHECK-NEXT: s_waitcnt lgkmcnt(24)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:31
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(16)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(8)
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p3_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u16 v2, v1
+; CHECK-NEXT: ds_read_u16 v3, v1 offset:2
+; CHECK-NEXT: ds_read_u16 v4, v1 offset:4
+; CHECK-NEXT: ds_read_u16 v5, v1 offset:6
+; CHECK-NEXT: ds_read_u16 v6, v1 offset:8
+; CHECK-NEXT: ds_read_u16 v7, v1 offset:10
+; CHECK-NEXT: ds_read_u16 v8, v1 offset:12
+; CHECK-NEXT: ds_read_u16 v1, v1 offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt lgkmcnt(6)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(5)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p3_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u8 v2, v1 offset:30
+; CHECK-NEXT: ds_read_u16 v3, v1 offset:28
+; CHECK-NEXT: ds_read_u16 v4, v1 offset:26
+; CHECK-NEXT: ds_read_u16 v5, v1 offset:24
+; CHECK-NEXT: ds_read_u16 v6, v1 offset:22
+; CHECK-NEXT: ds_read_u16 v7, v1 offset:20
+; CHECK-NEXT: ds_read_u16 v8, v1 offset:18
+; CHECK-NEXT: ds_read_u16 v9, v1 offset:16
+; CHECK-NEXT: ds_read_u16 v10, v1 offset:14
+; CHECK-NEXT: ds_read_u16 v11, v1 offset:12
+; CHECK-NEXT: ds_read_u16 v12, v1 offset:10
+; CHECK-NEXT: ds_read_u16 v13, v1 offset:8
+; CHECK-NEXT: ds_read_u16 v14, v1 offset:6
+; CHECK-NEXT: ds_read_u16 v15, v1 offset:4
+; CHECK-NEXT: ds_read_u16 v16, v1 offset:2
+; CHECK-NEXT: ds_read_u16 v1, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(14)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(11)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt lgkmcnt(10)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt lgkmcnt(9)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt lgkmcnt(8)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(6)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(5)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(3) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p3_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_u16 v2, v1 offset:30
+; CHECK-NEXT: ds_read_u16 v3, v1 offset:28
+; CHECK-NEXT: ds_read_u16 v4, v1 offset:26
+; CHECK-NEXT: ds_read_u16 v5, v1 offset:24
+; CHECK-NEXT: ds_read_u16 v6, v1 offset:22
+; CHECK-NEXT: ds_read_u16 v7, v1 offset:20
+; CHECK-NEXT: ds_read_u16 v8, v1 offset:18
+; CHECK-NEXT: ds_read_u16 v9, v1 offset:16
+; CHECK-NEXT: ds_read_u16 v10, v1 offset:14
+; CHECK-NEXT: ds_read_u16 v11, v1 offset:12
+; CHECK-NEXT: ds_read_u16 v12, v1 offset:10
+; CHECK-NEXT: ds_read_u16 v13, v1 offset:8
+; CHECK-NEXT: ds_read_u16 v14, v1 offset:6
+; CHECK-NEXT: ds_read_u16 v15, v1 offset:4
+; CHECK-NEXT: ds_read_u16 v16, v1 offset:2
+; CHECK-NEXT: ds_read_u16 v1, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(15)
+; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(14)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(13)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt lgkmcnt(12)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(11)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt lgkmcnt(10)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt lgkmcnt(9)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt lgkmcnt(8)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(7)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt lgkmcnt(6)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt lgkmcnt(5)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p3_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p3_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p3_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p3_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b32 v[5:6], v1 offset0:4 offset1:5
+; CHECK-NEXT: ds_read_b32 v7, v1 offset:24
+; CHECK-NEXT: ds_read_u16 v8, v1 offset:28
+; CHECK-NEXT: ds_read_u8 v9, v1 offset:30
+; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p3_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(3) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p3_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p3_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p3_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[1:4], v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p3_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p3_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read2_b32 v[5:6], v1 offset0:4 offset1:5
+; CHECK-NEXT: ds_read_b32 v7, v1 offset:24
+; CHECK-NEXT: ds_read_u16 v8, v1 offset:28
+; CHECK-NEXT: ds_read_u8 v9, v1 offset:30
+; CHECK-NEXT: ds_read_b128 v[1:4], v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p3_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(3) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p3_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ds_read_b128 v[2:5], v1
+; CHECK-NEXT: ds_read_b128 v[6:9], v1 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p4_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15
+; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13
+; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11
+; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9
+; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7
+; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5
+; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3
+; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p4_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:29
+; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:27
+; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:25
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:23
+; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:21
+; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:19
+; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:17
+; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:16
+; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:15
+; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:13
+; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:11
+; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:9
+; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:7
+; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:5
+; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:3
+; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:1
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(4) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p4_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:31
+; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:29
+; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:27
+; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:25
+; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:23
+; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:21
+; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:19
+; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:17
+; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:16
+; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:15
+; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:13
+; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:11
+; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:9
+; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:7
+; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:5
+; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:3
+; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:1
+; CHECK-NEXT: global_load_ubyte v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p4_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: global_load_ushort v3, v[1:2], off
+; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p4_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ushort v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(4) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p4_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30
+; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26
+; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24
+; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20
+; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18
+; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14
+; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12
+; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10
+; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8
+; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6
+; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4
+; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2
+; CHECK-NEXT: global_load_ushort v1, v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p4_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p4_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p4_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p4_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p4_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(4) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p4_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p4_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p4_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p4_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p4_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p4_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(4) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p4_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p5_sz16_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p5_sz31_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1e
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 1 readonly %src) {
+; CHECK-LABEL: memmove_p5_p5_sz32_align_1_1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1f
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p5_sz16_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p5_sz31_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addrspace(5) align 2 readonly %src) {
+; CHECK-LABEL: memmove_p5_p5_sz32_align_2_2:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0xf
+; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(14)
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(13)
+; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(10)
+; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(9)
+; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p5_sz16_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p5_sz16_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p5_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p5_sz31_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x8
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p5_sz32_align_8_8(ptr addrspace(5) align 8 %dst, ptr addrspace(5) align 8 readonly %src) {
+; CHECK-LABEL: memmove_p5_p5_sz32_align_8_8:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 32, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p5_sz16_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p5_sz16_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x3
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p5_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p5_sz31_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x8
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p5_p5_sz32_align_16_16(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 readonly %src) {
+; CHECK-LABEL: memmove_p5_p5_sz32_align_16_16:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 32, i1 false)
+ ret void
+}
+
+declare void @llvm.memmove.p0.p0.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p0.p1.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p0.p3.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p0.p4.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p0.p5.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p1.p0.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p1.p3.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p1.p4.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p1.p5.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p3.p0.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p3.p1.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p3.p3.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p3.p4.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p3.p5.i64(ptr addrspace(3) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p5.p0.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(0) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p5.p1.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p5.p3.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p5.p4.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #2
+
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll b/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll
new file mode 100644
index 0000000000000..1b8483a54bb3b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memmove-scalar-load.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s
+
+; Testing codegen for memmove with scalar reads.
+
+
+define void @memmove_p1_p4_sz16_align_4_4(ptr addrspace(1) align 4 %dst, ptr addrspace(4) align 4 readonly inreg %src) {
+; CHECK-LABEL: memmove_p1_p4_sz16_align_4_4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: v_mov_b32_e32 v3, s5
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry: ; Moves 16 bytes from the inreg addrspace(4) source to the addrspace(1) destination (both align 4); the assertions above expect a single s_load_dwordx4 feeding a single global_store_dwordx4.
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 4 %dst, ptr addrspace(4) noundef nonnull align 4 %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p4_sz31_align_4_4(ptr addrspace(1) align 4 %dst, ptr addrspace(4) align 4 readonly inreg %src) {
+; CHECK-LABEL: memmove_p1_p4_sz31_align_4_4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: global_load_ubyte v9, v2, s[4:5] offset:30
+; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: v_mov_b32_e32 v3, s5
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: v_mov_b32_e32 v10, s11
+; CHECK-NEXT: v_mov_b32_e32 v6, s8
+; CHECK-NEXT: v_mov_b32_e32 v7, s9
+; CHECK-NEXT: v_mov_b32_e32 v8, s10
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry: ; Moves 31 bytes (not a multiple of the dword size) with both pointers align 4; the assertions above expect dwordx4 + dwordx3 + short + byte stores, with byte 30 fetched by a separate global_load_ubyte alongside the s_load_dwordx8.
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 4 %dst, ptr addrspace(4) noundef nonnull align 4 %src, i64 31, i1 false)
+ ret void
+}
+
+define void @memmove_p1_p4_sz32_align_4_4(ptr addrspace(1) align 4 %dst, ptr addrspace(4) align 4 readonly inreg %src) {
+; CHECK-LABEL: memmove_p1_p4_sz32_align_4_4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, s8
+; CHECK-NEXT: v_mov_b32_e32 v3, s9
+; CHECK-NEXT: v_mov_b32_e32 v4, s10
+; CHECK-NEXT: v_mov_b32_e32 v5, s11
+; CHECK-NEXT: v_mov_b32_e32 v9, s7
+; CHECK-NEXT: v_mov_b32_e32 v8, s6
+; CHECK-NEXT: v_mov_b32_e32 v7, s5
+; CHECK-NEXT: v_mov_b32_e32 v6, s4
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry: ; Moves 32 bytes with both pointers align 4; the assertions above expect one s_load_dwordx8 followed by two global_store_dwordx4 (offset 16, then offset 0).
+ tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) noundef nonnull align 4 %dst, ptr addrspace(4) noundef nonnull align 4 %src, i64 32, i1 false)
+ ret void
+}
+
+declare void @llvm.memmove.p1.p4.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(4) nocapture readonly, i64, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; the only attribute group in this file; attached to the memmove intrinsic declaration above
+
More information about the llvm-commits
mailing list