[llvm] 2c091e6 - AMDGPU: Report unaligned scratch access as fast if supported by tgt (#158036)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 15 03:03:06 PDT 2025
Author: macurtis-amd
Date: 2025-09-15T05:03:02-05:00
New Revision: 2c091e6aec2d48fbcafc9cc5909a62f0321db1fd
URL: https://github.com/llvm/llvm-project/commit/2c091e6aec2d48fbcafc9cc5909a62f0321db1fd
DIFF: https://github.com/llvm/llvm-project/commit/2c091e6aec2d48fbcafc9cc5909a62f0321db1fd.diff
LOG: AMDGPU: Report unaligned scratch access as fast if supported by tgt (#158036)
This enables more consecutive load folding during
aggressive-instcombine.
The original motivating example provided by Jeff Byrnes:
https://godbolt.org/z/8ebcTEjTs
Example provided by Nikita Popov: https://godbolt.org/z/Gv1j4vjqE as
part of my original attempt to fix the issue (PR
[#133301](https://github.com/llvm/llvm-project/pull/133301), see his
[comment](https://github.com/llvm/llvm-project/pull/133301#issuecomment-2984905809)).
This changes the value of `IsFast` returned by `In
SITargetLowering::allowsMisalignedMemoryAccessesImpl` to be non-zero for
private and flat addresses if the subtarget supports unaligned scratch
accesses.
This enables aggressive-instcombine to do more folding of consecutive
loads (see
[here](https://github.com/llvm/llvm-project/blob/cbd496581fb6953a9a8d8387a010cc3a67d4654b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp#L811)).
Summary performance impact on
[composable_kernel](https://github.com/ROCm/composable_kernel):
|GPU|speedup (geomean*)|
|---|---|
|MI300A| 1.11|
|MI300X| 1.14|
|MI350X| 1.03|
[*] Just to be clear, this is the geomean across kernels which were
impacted by this change - not across all CK kernels.
Added:
llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll
llvm/test/Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3332723b038f5..9acc4b6de3501 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2098,10 +2098,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
bool AlignedBy4 = Alignment >= Align(4);
+ if (Subtarget->hasUnalignedScratchAccessEnabled()) {
+ if (IsFast)
+ *IsFast = AlignedBy4 ? Size : 1;
+ return true;
+ }
+
if (IsFast)
*IsFast = AlignedBy4;
- return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
+ return AlignedBy4;
}
// So long as they are correct, wide global memory operations perform better
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
index 37a261cab7563..e8bd640aa5409 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
@@ -7,23 +7,25 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
; MUBUF-LABEL: memcpy_fixed_align:
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MUBUF-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; MUBUF-NEXT: global_load_dwordx4 v[11:14], v[1:2], off offset:24
; MUBUF-NEXT: s_lshr_b32 s4, s32, 6
; MUBUF-NEXT: s_waitcnt vmcnt(2)
-; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32
-; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36
-; MUBUF-NEXT: s_waitcnt vmcnt(3)
; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:12
; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:8
; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4
; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32
-; MUBUF-NEXT: s_waitcnt vmcnt(6)
+; MUBUF-NEXT: s_waitcnt vmcnt(5)
; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28
; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24
; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20
; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:16
+; MUBUF-NEXT: s_waitcnt vmcnt(8)
+; MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:36
+; MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:32
+; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:28
+; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; use s4
; MUBUF-NEXT: ;;#ASMEND
@@ -35,14 +37,14 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
-; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
+; FLATSCR-NEXT: global_load_dwordx4 v[11:14], v[1:2], off offset:24
; FLATSCR-NEXT: s_mov_b32 s0, s32
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32 offset:16
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s32 offset:24
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; use s0
; FLATSCR-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 0003366f3a3ea..5b7c36559a366 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -12,21 +12,19 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v12, s3
-; CHECK-NEXT: v_mov_b32_e32 v11, s2
-; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
-; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
-; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
-; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
-; CHECK-NEXT: v_mov_b32_e32 v12, s1
-; CHECK-NEXT: v_mov_b32_e32 v11, s0
+; CHECK-NEXT: v_mov_b32_e32 v9, s3
+; CHECK-NEXT: v_mov_b32_e32 v8, s2
+; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
+; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
+; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
+; CHECK-NEXT: v_mov_b32_e32 v9, s1
+; CHECK-NEXT: v_mov_b32_e32 v8, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
-; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
-; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
-; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
-; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
+; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
+; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -173,33 +171,33 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: v_mov_b32_e32 v26, s0
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
+; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:48
+; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:60
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
-; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
@@ -213,10 +211,10 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:48
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
@@ -281,8 +279,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
-; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
-; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
+; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
@@ -302,21 +300,19 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v12, s3
-; CHECK-NEXT: v_mov_b32_e32 v11, s2
-; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
-; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
-; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
-; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
-; CHECK-NEXT: v_mov_b32_e32 v12, s1
-; CHECK-NEXT: v_mov_b32_e32 v11, s0
+; CHECK-NEXT: v_mov_b32_e32 v9, s3
+; CHECK-NEXT: v_mov_b32_e32 v8, s2
+; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
+; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
+; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
+; CHECK-NEXT: v_mov_b32_e32 v9, s1
+; CHECK-NEXT: v_mov_b32_e32 v8, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
-; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
-; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
-; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
-; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
+; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
+; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -463,33 +459,33 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: v_mov_b32_e32 v26, s0
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
+; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:48
+; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:60
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
-; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
@@ -503,10 +499,10 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:48
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
@@ -571,8 +567,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
-; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
-; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
+; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
index b43ccc551ca95..048610184368d 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
@@ -27,19 +27,16 @@ define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -83,19 +80,16 @@ define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -239,19 +233,16 @@ define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -295,19 +286,16 @@ define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -437,7 +425,7 @@ define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -451,19 +439,15 @@ define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: ds_read_b64 v[7:8], v2 offset:23
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -475,8 +459,8 @@ define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@@ -492,7 +476,7 @@ define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -506,19 +490,15 @@ define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: ds_read_b64 v[7:8], v2 offset:23
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -530,8 +510,8 @@ define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@@ -643,12 +623,9 @@ define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -660,24 +637,16 @@ define void @memcpy_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24
-; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:8
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -689,18 +658,13 @@ define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -712,12 +676,9 @@ define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -729,24 +690,16 @@ define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24
-; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:8
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -758,18 +711,13 @@ define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -895,22 +843,20 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -964,22 +910,20 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -1161,15 +1105,15 @@ define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x2
-; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23
-; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -1211,15 +1155,15 @@ define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x2
-; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23
-; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -1929,18 +1873,18 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x7
-; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1994,18 +1938,18 @@ define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x7
-; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -3267,19 +3211,16 @@ define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
@@ -3334,19 +3275,16 @@ define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p0_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
@@ -3525,24 +3463,21 @@ define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -3592,24 +3527,21 @@ define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -3783,25 +3715,20 @@ define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v1 offset:24
-; CHECK-NEXT: ds_read_u16 v9, v1 offset:28
-; CHECK-NEXT: ds_read_u8 v10, v1 offset:30
; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: ds_read_b64 v[8:9], v1 offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
-; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -3850,25 +3777,20 @@ define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v1 offset:24
-; CHECK-NEXT: ds_read_u16 v9, v1 offset:28
-; CHECK-NEXT: ds_read_u8 v10, v1 offset:30
; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: ds_read_b64 v[8:9], v1 offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
-; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4037,24 +3959,21 @@ define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p4_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4104,24 +4023,21 @@ define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4302,34 +4218,31 @@ define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4398,34 +4311,31 @@ define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
index f08ea27040fb5..01b7f40f6256f 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
@@ -471,7 +471,7 @@ define void @memmove_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz16_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -489,7 +489,7 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
@@ -509,8 +509,8 @@ define void @memmove_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz32_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@@ -526,7 +526,7 @@ define void @memmove_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz16_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -544,7 +544,7 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
@@ -564,8 +564,8 @@ define void @memmove_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz32_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@@ -2077,21 +2077,23 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: global_store_dword v[0:1], v10, off offset:24
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -2143,21 +2145,23 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: global_store_dword v[0:1], v10, off offset:24
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
diff --git a/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll
new file mode 100644
index 0000000000000..05d2330fffc7f
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll
@@ -0,0 +1,234 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=sroa,instcombine,aggressive-instcombine %s -S -o - | FileCheck %s
+
+define i64 @quux(ptr %arg) {
+; CHECK-LABEL: define i64 @quux(
+; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[ARG]], align 1
+; CHECK-NEXT: ret i64 [[LOAD]]
+;
+bb:
+ %load = load i8, ptr %arg, align 1
+ %getelementptr = getelementptr inbounds nuw i8, ptr %arg, i64 1
+ %load1 = load i8, ptr %getelementptr, align 1
+ %getelementptr2 = getelementptr inbounds nuw i8, ptr %arg, i64 2
+ %load3 = load i8, ptr %getelementptr2, align 1
+ %getelementptr4 = getelementptr inbounds nuw i8, ptr %arg, i64 3
+ %load5 = load i8, ptr %getelementptr4, align 1
+ %getelementptr6 = getelementptr inbounds nuw i8, ptr %arg, i64 4
+ %load7 = load i8, ptr %getelementptr6, align 1
+ %getelementptr8 = getelementptr inbounds nuw i8, ptr %arg, i64 5
+ %load9 = load i8, ptr %getelementptr8, align 1
+ %getelementptr10 = getelementptr inbounds nuw i8, ptr %arg, i64 6
+ %load11 = load i8, ptr %getelementptr10, align 1
+ %getelementptr12 = getelementptr inbounds nuw i8, ptr %arg, i64 7
+ %load13 = load i8, ptr %getelementptr12, align 1
+ %zext = zext i8 %load13 to i64
+ %shl = shl nuw i64 %zext, 56
+ %zext14 = zext i8 %load11 to i64
+ %shl15 = shl nuw nsw i64 %zext14, 48
+ %or = or disjoint i64 %shl, %shl15
+ %zext16 = zext i8 %load9 to i64
+ %shl17 = shl nuw nsw i64 %zext16, 40
+ %or18 = or disjoint i64 %or, %shl17
+ %zext19 = zext i8 %load7 to i64
+ %shl20 = shl nuw nsw i64 %zext19, 32
+ %or21 = or disjoint i64 %or18, %shl20
+ %zext22 = zext i8 %load5 to i64
+ %shl23 = shl nuw nsw i64 %zext22, 24
+ %or24 = or disjoint i64 %or21, %shl23
+ %zext25 = zext i8 %load3 to i64
+ %shl26 = shl nuw nsw i64 %zext25, 16
+ %zext27 = zext i8 %load1 to i64
+ %shl28 = shl nuw nsw i64 %zext27, 8
+ %or29 = or disjoint i64 %or24, %shl26
+ %zext30 = zext i8 %load to i64
+ %or31 = or i64 %or29, %shl28
+ %or32 = or i64 %or31, %zext30
+ ret i64 %or32
+}
+
+
+; The following test case reduced from a client kernel
+define fastcc <16 x float> @hoge(ptr %arg) {
+; CHECK-LABEL: define fastcc <16 x float> @hoge(
+; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[ARG]], align 8
+; CHECK-NEXT: [[LOAD28:%.*]] = load i64, ptr [[LOAD]], align 1
+; CHECK-NEXT: [[GETELEMENTPTR72:%.*]] = getelementptr i8, ptr [[LOAD]], i64 8
+; CHECK-NEXT: [[LOAD73:%.*]] = load i64, ptr [[GETELEMENTPTR72]], align 1
+; CHECK-NEXT: [[GETELEMENTPTR120:%.*]] = getelementptr i8, ptr [[LOAD]], i64 16
+; CHECK-NEXT: [[LOAD121:%.*]] = load i64, ptr [[GETELEMENTPTR120]], align 1
+; CHECK-NEXT: [[GETELEMENTPTR168:%.*]] = getelementptr i8, ptr [[LOAD]], i64 24
+; CHECK-NEXT: [[LOAD169:%.*]] = load i64, ptr [[GETELEMENTPTR168]], align 1
+; CHECK-NEXT: [[CALL:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD28]], i64 0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[CALL225:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD73]], i64 0, <16 x float> [[CALL]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[CALL230:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD121]], i64 0, <16 x float> [[CALL225]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[CALL235:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD169]], i64 0, <16 x float> [[CALL230]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: ret <16 x float> [[CALL235]]
+;
+bb:
+ %load = load ptr, ptr %arg, align 8
+ %load28 = load i8, ptr %load, align 1
+ %getelementptr30 = getelementptr i8, ptr %load, i64 1
+ %load31 = load i8, ptr %getelementptr30, align 1
+ %getelementptr36 = getelementptr i8, ptr %load, i64 2
+ %load37 = load i8, ptr %getelementptr36, align 1
+ %getelementptr42 = getelementptr i8, ptr %load, i64 3
+ %load43 = load i8, ptr %getelementptr42, align 1
+ %getelementptr48 = getelementptr i8, ptr %load, i64 4
+ %load49 = load i8, ptr %getelementptr48, align 1
+ %getelementptr54 = getelementptr i8, ptr %load, i64 5
+ %load55 = load i8, ptr %getelementptr54, align 1
+ %getelementptr60 = getelementptr i8, ptr %load, i64 6
+ %load61 = load i8, ptr %getelementptr60, align 1
+ %getelementptr66 = getelementptr i8, ptr %load, i64 7
+ %load67 = load i8, ptr %getelementptr66, align 1
+ %getelementptr72 = getelementptr i8, ptr %load, i64 8
+ %load73 = load i8, ptr %getelementptr72, align 1
+ %getelementptr78 = getelementptr i8, ptr %load, i64 9
+ %load79 = load i8, ptr %getelementptr78, align 1
+ %getelementptr84 = getelementptr i8, ptr %load, i64 10
+ %load85 = load i8, ptr %getelementptr84, align 1
+ %getelementptr90 = getelementptr i8, ptr %load, i64 11
+ %load91 = load i8, ptr %getelementptr90, align 1
+ %getelementptr96 = getelementptr i8, ptr %load, i64 12
+ %load97 = load i8, ptr %getelementptr96, align 1
+ %getelementptr102 = getelementptr i8, ptr %load, i64 13
+ %load103 = load i8, ptr %getelementptr102, align 1
+ %getelementptr108 = getelementptr i8, ptr %load, i64 14
+ %load109 = load i8, ptr %getelementptr108, align 1
+ %getelementptr114 = getelementptr i8, ptr %load, i64 15
+ %load115 = load i8, ptr %getelementptr114, align 1
+ %getelementptr120 = getelementptr i8, ptr %load, i64 16
+ %load121 = load i8, ptr %getelementptr120, align 1
+ %getelementptr126 = getelementptr i8, ptr %load, i64 17
+ %load127 = load i8, ptr %getelementptr126, align 1
+ %getelementptr132 = getelementptr i8, ptr %load, i64 18
+ %load133 = load i8, ptr %getelementptr132, align 1
+ %getelementptr138 = getelementptr i8, ptr %load, i64 19
+ %load139 = load i8, ptr %getelementptr138, align 1
+ %getelementptr144 = getelementptr i8, ptr %load, i64 20
+ %load145 = load i8, ptr %getelementptr144, align 1
+ %getelementptr150 = getelementptr i8, ptr %load, i64 21
+ %load151 = load i8, ptr %getelementptr150, align 1
+ %getelementptr156 = getelementptr i8, ptr %load, i64 22
+ %load157 = load i8, ptr %getelementptr156, align 1
+ %getelementptr162 = getelementptr i8, ptr %load, i64 23
+ %load163 = load i8, ptr %getelementptr162, align 1
+ %getelementptr168 = getelementptr i8, ptr %load, i64 24
+ %load169 = load i8, ptr %getelementptr168, align 1
+ %getelementptr174 = getelementptr i8, ptr %load, i64 25
+ %load175 = load i8, ptr %getelementptr174, align 1
+ %getelementptr180 = getelementptr i8, ptr %load, i64 26
+ %load181 = load i8, ptr %getelementptr180, align 1
+ %getelementptr186 = getelementptr i8, ptr %load, i64 27
+ %load187 = load i8, ptr %getelementptr186, align 1
+ %getelementptr192 = getelementptr i8, ptr %load, i64 28
+ %load193 = load i8, ptr %getelementptr192, align 1
+ %getelementptr198 = getelementptr i8, ptr %load, i64 29
+ %load199 = load i8, ptr %getelementptr198, align 1
+ %getelementptr204 = getelementptr i8, ptr %load, i64 30
+ %load205 = load i8, ptr %getelementptr204, align 1
+ %getelementptr210 = getelementptr i8, ptr %load, i64 31
+ %load211 = load i8, ptr %getelementptr210, align 1
+ %alloca1.sroa.8.0.insert.ext = zext i8 %load67 to i64
+ %alloca1.sroa.8.0.insert.shift = shl i64 %alloca1.sroa.8.0.insert.ext, 56
+ %alloca1.sroa.7.0.insert.ext = zext i8 %load61 to i64
+ %alloca1.sroa.7.0.insert.shift = shl i64 %alloca1.sroa.7.0.insert.ext, 48
+ %alloca1.sroa.7.0.insert.insert = or i64 %alloca1.sroa.8.0.insert.shift, %alloca1.sroa.7.0.insert.shift
+ %alloca1.sroa.6.0.insert.ext = zext i8 %load55 to i64
+ %alloca1.sroa.6.0.insert.shift = shl i64 %alloca1.sroa.6.0.insert.ext, 40
+ %alloca1.sroa.6.0.insert.insert = or i64 %alloca1.sroa.7.0.insert.insert, %alloca1.sroa.6.0.insert.shift
+ %alloca1.sroa.5.0.insert.ext = zext i8 %load49 to i64
+ %alloca1.sroa.5.0.insert.shift = shl i64 %alloca1.sroa.5.0.insert.ext, 32
+ %alloca1.sroa.5.0.insert.insert = or i64 %alloca1.sroa.6.0.insert.insert, %alloca1.sroa.5.0.insert.shift
+ %alloca1.sroa.4.0.insert.ext = zext i8 %load43 to i64
+ %alloca1.sroa.4.0.insert.shift = shl i64 %alloca1.sroa.4.0.insert.ext, 24
+ %alloca1.sroa.4.0.insert.insert = or i64 %alloca1.sroa.5.0.insert.insert, %alloca1.sroa.4.0.insert.shift
+ %alloca1.sroa.3.0.insert.ext = zext i8 %load37 to i64
+ %alloca1.sroa.3.0.insert.shift = shl i64 %alloca1.sroa.3.0.insert.ext, 16
+ %alloca1.sroa.2.0.insert.ext = zext i8 %load31 to i64
+ %alloca1.sroa.2.0.insert.shift = shl i64 %alloca1.sroa.2.0.insert.ext, 8
+ %alloca1.sroa.2.0.insert.mask = or i64 %alloca1.sroa.4.0.insert.insert, %alloca1.sroa.3.0.insert.shift
+ %alloca1.sroa.0.0.insert.ext = zext i8 %load28 to i64
+ %alloca1.sroa.0.0.insert.mask = or i64 %alloca1.sroa.2.0.insert.mask, %alloca1.sroa.2.0.insert.shift
+ %alloca1.sroa.0.0.insert.insert = or i64 %alloca1.sroa.0.0.insert.mask, %alloca1.sroa.0.0.insert.ext
+ %call = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.0.0.insert.insert, i64 0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+ %alloca1.sroa.17.8.insert.ext = zext i8 %load115 to i64
+ %alloca1.sroa.17.8.insert.shift = shl i64 %alloca1.sroa.17.8.insert.ext, 56
+ %alloca1.sroa.16.8.insert.ext = zext i8 %load109 to i64
+ %alloca1.sroa.16.8.insert.shift = shl i64 %alloca1.sroa.16.8.insert.ext, 48
+ %alloca1.sroa.16.8.insert.insert = or i64 %alloca1.sroa.17.8.insert.shift, %alloca1.sroa.16.8.insert.shift
+ %alloca1.sroa.15.8.insert.ext = zext i8 %load103 to i64
+ %alloca1.sroa.15.8.insert.shift = shl i64 %alloca1.sroa.15.8.insert.ext, 40
+ %alloca1.sroa.15.8.insert.insert = or i64 %alloca1.sroa.16.8.insert.insert, %alloca1.sroa.15.8.insert.shift
+ %alloca1.sroa.14.8.insert.ext = zext i8 %load97 to i64
+ %alloca1.sroa.14.8.insert.shift = shl i64 %alloca1.sroa.14.8.insert.ext, 32
+ %alloca1.sroa.14.8.insert.insert = or i64 %alloca1.sroa.15.8.insert.insert, %alloca1.sroa.14.8.insert.shift
+ %alloca1.sroa.13.8.insert.ext = zext i8 %load91 to i64
+ %alloca1.sroa.13.8.insert.shift = shl i64 %alloca1.sroa.13.8.insert.ext, 24
+ %alloca1.sroa.13.8.insert.insert = or i64 %alloca1.sroa.14.8.insert.insert, %alloca1.sroa.13.8.insert.shift
+ %alloca1.sroa.12.8.insert.ext = zext i8 %load85 to i64
+ %alloca1.sroa.12.8.insert.shift = shl i64 %alloca1.sroa.12.8.insert.ext, 16
+ %alloca1.sroa.11.8.insert.ext = zext i8 %load79 to i64
+ %alloca1.sroa.11.8.insert.shift = shl i64 %alloca1.sroa.11.8.insert.ext, 8
+ %alloca1.sroa.11.8.insert.mask = or i64 %alloca1.sroa.13.8.insert.insert, %alloca1.sroa.12.8.insert.shift
+ %alloca1.sroa.9.8.insert.ext = zext i8 %load73 to i64
+ %alloca1.sroa.9.8.insert.mask = or i64 %alloca1.sroa.11.8.insert.mask, %alloca1.sroa.11.8.insert.shift
+ %alloca1.sroa.9.8.insert.insert = or i64 %alloca1.sroa.9.8.insert.mask, %alloca1.sroa.9.8.insert.ext
+ %call225 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.9.8.insert.insert, i64 0, <16 x float> %call, i32 0, i32 0, i32 0)
+ %alloca1.sroa.26.16.insert.ext = zext i8 %load163 to i64
+ %alloca1.sroa.26.16.insert.shift = shl i64 %alloca1.sroa.26.16.insert.ext, 56
+ %alloca1.sroa.25.16.insert.ext = zext i8 %load157 to i64
+ %alloca1.sroa.25.16.insert.shift = shl i64 %alloca1.sroa.25.16.insert.ext, 48
+ %alloca1.sroa.25.16.insert.insert = or i64 %alloca1.sroa.26.16.insert.shift, %alloca1.sroa.25.16.insert.shift
+ %alloca1.sroa.24.16.insert.ext = zext i8 %load151 to i64
+ %alloca1.sroa.24.16.insert.shift = shl i64 %alloca1.sroa.24.16.insert.ext, 40
+ %alloca1.sroa.24.16.insert.insert = or i64 %alloca1.sroa.25.16.insert.insert, %alloca1.sroa.24.16.insert.shift
+ %alloca1.sroa.23.16.insert.ext = zext i8 %load145 to i64
+ %alloca1.sroa.23.16.insert.shift = shl i64 %alloca1.sroa.23.16.insert.ext, 32
+ %alloca1.sroa.23.16.insert.insert = or i64 %alloca1.sroa.24.16.insert.insert, %alloca1.sroa.23.16.insert.shift
+ %alloca1.sroa.22.16.insert.ext = zext i8 %load139 to i64
+ %alloca1.sroa.22.16.insert.shift = shl i64 %alloca1.sroa.22.16.insert.ext, 24
+ %alloca1.sroa.22.16.insert.insert = or i64 %alloca1.sroa.23.16.insert.insert, %alloca1.sroa.22.16.insert.shift
+ %alloca1.sroa.21.16.insert.ext = zext i8 %load133 to i64
+ %alloca1.sroa.21.16.insert.shift = shl i64 %alloca1.sroa.21.16.insert.ext, 16
+ %alloca1.sroa.20.16.insert.ext = zext i8 %load127 to i64
+ %alloca1.sroa.20.16.insert.shift = shl i64 %alloca1.sroa.20.16.insert.ext, 8
+ %alloca1.sroa.20.16.insert.mask = or i64 %alloca1.sroa.22.16.insert.insert, %alloca1.sroa.21.16.insert.shift
+ %alloca1.sroa.18.16.insert.ext = zext i8 %load121 to i64
+ %alloca1.sroa.18.16.insert.mask = or i64 %alloca1.sroa.20.16.insert.mask, %alloca1.sroa.20.16.insert.shift
+ %alloca1.sroa.18.16.insert.insert = or i64 %alloca1.sroa.18.16.insert.mask, %alloca1.sroa.18.16.insert.ext
+ %call230 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.18.16.insert.insert, i64 0, <16 x float> %call225, i32 0, i32 0, i32 0)
+ %alloca1.sroa.35.24.insert.ext = zext i8 %load211 to i64
+ %alloca1.sroa.35.24.insert.shift = shl i64 %alloca1.sroa.35.24.insert.ext, 56
+ %alloca1.sroa.34.24.insert.ext = zext i8 %load205 to i64
+ %alloca1.sroa.34.24.insert.shift = shl i64 %alloca1.sroa.34.24.insert.ext, 48
+ %alloca1.sroa.34.24.insert.insert = or i64 %alloca1.sroa.35.24.insert.shift, %alloca1.sroa.34.24.insert.shift
+ %alloca1.sroa.33.24.insert.ext = zext i8 %load199 to i64
+ %alloca1.sroa.33.24.insert.shift = shl i64 %alloca1.sroa.33.24.insert.ext, 40
+ %alloca1.sroa.33.24.insert.insert = or i64 %alloca1.sroa.34.24.insert.insert, %alloca1.sroa.33.24.insert.shift
+ %alloca1.sroa.32.24.insert.ext = zext i8 %load193 to i64
+ %alloca1.sroa.32.24.insert.shift = shl i64 %alloca1.sroa.32.24.insert.ext, 32
+ %alloca1.sroa.32.24.insert.insert = or i64 %alloca1.sroa.33.24.insert.insert, %alloca1.sroa.32.24.insert.shift
+ %alloca1.sroa.31.24.insert.ext = zext i8 %load187 to i64
+ %alloca1.sroa.31.24.insert.shift = shl i64 %alloca1.sroa.31.24.insert.ext, 24
+ %alloca1.sroa.31.24.insert.insert = or i64 %alloca1.sroa.32.24.insert.insert, %alloca1.sroa.31.24.insert.shift
+ %alloca1.sroa.30.24.insert.ext = zext i8 %load181 to i64
+ %alloca1.sroa.30.24.insert.shift = shl i64 %alloca1.sroa.30.24.insert.ext, 16
+ %alloca1.sroa.29.24.insert.ext = zext i8 %load175 to i64
+ %alloca1.sroa.29.24.insert.shift = shl i64 %alloca1.sroa.29.24.insert.ext, 8
+ %alloca1.sroa.29.24.insert.mask = or i64 %alloca1.sroa.31.24.insert.insert, %alloca1.sroa.30.24.insert.shift
+ %alloca1.sroa.27.24.insert.ext = zext i8 %load169 to i64
+ %alloca1.sroa.27.24.insert.mask = or i64 %alloca1.sroa.29.24.insert.mask, %alloca1.sroa.29.24.insert.shift
+ %alloca1.sroa.27.24.insert.insert = or i64 %alloca1.sroa.27.24.insert.mask, %alloca1.sroa.27.24.insert.ext
+ %call235 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.27.24.insert.insert, i64 0, <16 x float> %call230, i32 0, i32 0, i32 0)
+ ret <16 x float> %call235
+}
+
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64, i64, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #0
+
+attributes #0 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg
new file mode 100644
index 0000000000000..7c492428aec76
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "AMDGPU" in config.root.targets:
+ config.unsupported = True
More information about the llvm-commits
mailing list