[llvm] ac94073 - [AMDGPU] Refine 64 bit misaligned LDS ops selection
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 21 09:37:23 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-04-21T09:37:16-07:00
New Revision: ac94073daa18687b76dc49a60bb2844799f28ee3
URL: https://github.com/llvm/llvm-project/commit/ac94073daa18687b76dc49a60bb2844799f28ee3
DIFF: https://github.com/llvm/llvm-project/commit/ac94073daa18687b76dc49a60bb2844799f28ee3.diff
LOG: [AMDGPU] Refine 64 bit misaligned LDS ops selection
Here is the performance data:
```
Using platform: AMD Accelerated Parallel Processing
Using device: gfx900:xnack-
ds_write_b64 aligned by 8: 3.2 sec
ds_write2_b32 aligned by 8: 3.2 sec
ds_write_b16 * 4 aligned by 8: 7.0 sec
ds_write_b8 * 8 aligned by 8: 13.2 sec
ds_write_b64 aligned by 1: 7.3 sec
ds_write2_b32 aligned by 1: 7.5 sec
ds_write_b16 * 4 aligned by 1: 14.0 sec
ds_write_b8 * 8 aligned by 1: 13.2 sec
ds_write_b64 aligned by 2: 7.3 sec
ds_write2_b32 aligned by 2: 7.5 sec
ds_write_b16 * 4 aligned by 2: 7.1 sec
ds_write_b8 * 8 aligned by 2: 13.3 sec
ds_write_b64 aligned by 4: 4.6 sec
ds_write2_b32 aligned by 4: 3.2 sec
ds_write_b16 * 4 aligned by 4: 7.1 sec
ds_write_b8 * 8 aligned by 4: 13.3 sec
ds_read_b64 aligned by 8: 2.3 sec
ds_read2_b32 aligned by 8: 2.2 sec
ds_read_u16 * 4 aligned by 8: 4.8 sec
ds_read_u8 * 8 aligned by 8: 8.6 sec
ds_read_b64 aligned by 1: 4.4 sec
ds_read2_b32 aligned by 1: 7.3 sec
ds_read_u16 * 4 aligned by 1: 14.0 sec
ds_read_u8 * 8 aligned by 1: 8.7 sec
ds_read_b64 aligned by 2: 4.4 sec
ds_read2_b32 aligned by 2: 7.3 sec
ds_read_u16 * 4 aligned by 2: 4.8 sec
ds_read_u8 * 8 aligned by 2: 8.7 sec
ds_read_b64 aligned by 4: 4.4 sec
ds_read2_b32 aligned by 4: 2.3 sec
ds_read_u16 * 4 aligned by 4: 4.8 sec
ds_read_u8 * 8 aligned by 4: 8.7 sec
Using platform: AMD Accelerated Parallel Processing
Using device: gfx1030
ds_write_b64 aligned by 8: 4.4 sec
ds_write2_b32 aligned by 8: 4.3 sec
ds_write_b16 * 4 aligned by 8: 7.9 sec
ds_write_b8 * 8 aligned by 8: 13.0 sec
ds_write_b64 aligned by 1: 23.2 sec
ds_write2_b32 aligned by 1: 23.1 sec
ds_write_b16 * 4 aligned by 1: 44.0 sec
ds_write_b8 * 8 aligned by 1: 13.0 sec
ds_write_b64 aligned by 2: 23.2 sec
ds_write2_b32 aligned by 2: 23.1 sec
ds_write_b16 * 4 aligned by 2: 7.9 sec
ds_write_b8 * 8 aligned by 2: 13.1 sec
ds_write_b64 aligned by 4: 13.5 sec
ds_write2_b32 aligned by 4: 4.3 sec
ds_write_b16 * 4 aligned by 4: 7.9 sec
ds_write_b8 * 8 aligned by 4: 13.1 sec
ds_read_b64 aligned by 8: 3.5 sec
ds_read2_b32 aligned by 8: 3.4 sec
ds_read_u16 * 4 aligned by 8: 5.3 sec
ds_read_u8 * 8 aligned by 8: 8.5 sec
ds_read_b64 aligned by 1: 13.1 sec
ds_read2_b32 aligned by 1: 22.7 sec
ds_read_u16 * 4 aligned by 1: 43.9 sec
ds_read_u8 * 8 aligned by 1: 7.9 sec
ds_read_b64 aligned by 2: 13.1 sec
ds_read2_b32 aligned by 2: 22.7 sec
ds_read_u16 * 4 aligned by 2: 5.6 sec
ds_read_u8 * 8 aligned by 2: 7.9 sec
ds_read_b64 aligned by 4: 13.1 sec
ds_read2_b32 aligned by 4: 3.4 sec
ds_read_u16 * 4 aligned by 4: 5.6 sec
ds_read_u8 * 8 aligned by 4: 7.9 sec
```
GFX10 exposes a different pattern for sub-DWORD load/store performance
than GFX9. On GFX9 it is faster to issue a single unaligned load or
store than a fully split b8 access, whereas on GFX10 even a full split
is better. However, this gain is only theoretical, because splitting
an access down to the sub-dword level requires more registers and
packing/unpacking logic. Ignoring that option, it is better to use a
single 64-bit instruction on misaligned data, with the exception of
4-byte aligned data, where ds_read2_b32/ds_write2_b32 is better.
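For illustration, here is a minimal IR sketch in the style of the
ds-alignment.ll tests updated below (the function name lds_copy_align1 is
hypothetical): with unaligned access mode enabled, this align-1 copy is
expected to select a single ds_read_b64/ds_write_b64 pair, while the same
copy at align 4 would keep ds_read2_b32/ds_write2_b32.
```
; 64-bit LDS copy with alignment below 4: selected as ds_read_b64/ds_write_b64
; after this change instead of ds_read2_b32/ds_write2_b32.
define amdgpu_kernel void @lds_copy_align1(<2 x i32> addrspace(3)* %in,
                                           <2 x i32> addrspace(3)* %out) {
  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1
  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1
  ret void
}
```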
Differential Revision: https://reviews.llvm.org/D123956
Added:
Modified:
llvm/lib/Target/AMDGPU/DSInstructions.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/ds-alignment.ll
llvm/test/CodeGen/AMDGPU/ds_read2.ll
llvm/test/CodeGen/AMDGPU/ds_write2.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index bc2722e027594..049d8000d31ce 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -879,6 +879,15 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
let SubtargetPredicate = HasUnalignedAccessMode in {
+// Select 64 bit loads and stores aligned less than 4 as a single ds_read_b64/
+// ds_write_b64 instruction as this is faster than ds_read2_b32/ds_write2_b32
+// which would be used otherwise. In this case a b32 access would still be
+// misaligned, but we will have 2 of them.
+foreach vt = VReg_64.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B64, vt, "load_align_less_than_4_local">;
+defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align_less_than_4_local">;
+}
+
// Selection will split most of the unaligned 3 dword accesses due to performance
// reasons when beneficial. Keep these two patterns for the rest of the cases.
foreach vt = VReg_96.RegTypes in {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 246376aabaf71..bd24364c35989 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1545,6 +1545,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
// can do a 4 byte aligned, 8 byte access in a single operation using
// ds_read2/write2_b32 with adjacent offsets.
RequiredAlignment = Align(4);
+
+ if (Subtarget->hasUnalignedDSAccessEnabled()) {
+ // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
+ // ds_write2_b32 depending on the alignment. In either case with either
+ // alignment there is no faster way of doing this.
+ if (IsFast)
+ *IsFast = true;
+ return true;
+ }
+
break;
case 96:
if (!Subtarget->hasDS96AndDS128())
@@ -1593,14 +1603,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
break;
}
- if (IsFast) {
- // FIXME: Lie it is fast if +unaligned-access-mode is passed so that
- // DS accesses get vectorized. Do this only for sizes below 96 as
- // b96 and b128 cases already properly handled.
- // Remove Subtarget check once all sizes properly handled.
- *IsFast = Alignment >= RequiredAlignment ||
- (Subtarget->hasUnalignedDSAccessEnabled() && Size < 96);
- }
+ if (IsFast)
+ *IsFast = Alignment >= RequiredAlignment;
return Alignment >= RequiredAlignment ||
Subtarget->hasUnalignedDSAccessEnabled();
diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index f2589a55254f1..28b830903c59b 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -278,10 +278,10 @@ define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addr
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0
; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
+; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1]
; UNALIGNED-NEXT: s_endpgm
%val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1
store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1
@@ -334,10 +334,10 @@ define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addr
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0
; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
+; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1]
; UNALIGNED-NEXT: s_endpgm
%val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2
store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 7af9c948a1e41..64aa283314e45 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -691,8 +691,8 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out,
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s2, v2, 5
-; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2
+; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 offset:5
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1]
@@ -1530,10 +1530,9 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)*
;
; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset:
; GFX9-UNALIGNED: ; %bb.0: ; %entry
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41
-; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v2 offset:65
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-UNALIGNED-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 4d58ed525bf07..32ee858b56ba9 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -708,11 +708,9 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double add
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2
-; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v3, 5, v2
-; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, 9, v2
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-UNALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
-; GFX9-UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
+; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:5
+; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:9
; GFX9-UNALIGNED-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
@@ -1043,10 +1041,10 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
;
; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-UNALIGNED: ; %bb.0: ; %entry
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x1c8
-; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x1c8
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:65
; GFX9-UNALIGNED-NEXT: s_endpgm
entry:
store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1