[llvm] f6462a2 - [AMDGPU] Split unaligned 4 DWORD DS operations
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 12 16:07:20 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-04-12T16:07:13-07:00
New Revision: f6462a26f03fa0d0c2b36b305d1af91b8841485f
URL: https://github.com/llvm/llvm-project/commit/f6462a26f03fa0d0c2b36b305d1af91b8841485f
DIFF: https://github.com/llvm/llvm-project/commit/f6462a26f03fa0d0c2b36b305d1af91b8841485f.diff
LOG: [AMDGPU] Split unaligned 4 DWORD DS operations
Similarly to the 3 DWORD operations, it is better for performance
to split unaligned operations as long as these are at least
DWORD aligned. Performance data:
```
Using platform: AMD Accelerated Parallel Processing
Using device: gfx900:xnack-
ds_write_b128 aligned by 16: 4.9 sec
ds_write2_b64 aligned by 16: 5.1 sec
ds_write2_b32 * 2 aligned by 16: 5.5 sec
ds_write_b128 aligned by 1: 8.1 sec
ds_write2_b64 aligned by 1: 8.7 sec
ds_write2_b32 * 2 aligned by 1: 14.0 sec
ds_write_b128 aligned by 2: 8.1 sec
ds_write2_b64 aligned by 2: 8.7 sec
ds_write2_b32 * 2 aligned by 2: 14.0 sec
ds_write_b128 aligned by 4: 5.6 sec
ds_write2_b64 aligned by 4: 8.7 sec
ds_write2_b32 * 2 aligned by 4: 5.6 sec
ds_write_b128 aligned by 8: 5.6 sec
ds_write2_b64 aligned by 8: 5.1 sec
ds_write2_b32 * 2 aligned by 8: 5.6 sec
ds_read_b128 aligned by 16: 3.8 sec
ds_read2_b64 aligned by 16: 3.8 sec
ds_read2_b32 * 2 aligned by 16: 4.0 sec
ds_read_b128 aligned by 1: 4.6 sec
ds_read2_b64 aligned by 1: 8.1 sec
ds_read2_b32 * 2 aligned by 1: 14.0 sec
ds_read_b128 aligned by 2: 4.6 sec
ds_read2_b64 aligned by 2: 8.1 sec
ds_read2_b32 * 2 aligned by 2: 14.0 sec
ds_read_b128 aligned by 4: 4.6 sec
ds_read2_b64 aligned by 4: 8.1 sec
ds_read2_b32 * 2 aligned by 4: 4.0 sec
ds_read_b128 aligned by 8: 4.6 sec
ds_read2_b64 aligned by 8: 3.8 sec
ds_read2_b32 * 2 aligned by 8: 4.0 sec
Using platform: AMD Accelerated Parallel Processing
Using device: gfx1030
ds_write_b128 aligned by 16: 6.2 sec
ds_write2_b64 aligned by 16: 7.1 sec
ds_write2_b32 * 2 aligned by 16: 7.6 sec
ds_write_b128 aligned by 1: 24.1 sec
ds_write2_b64 aligned by 1: 25.2 sec
ds_write2_b32 * 2 aligned by 1: 43.7 sec
ds_write_b128 aligned by 2: 24.1 sec
ds_write2_b64 aligned by 2: 25.1 sec
ds_write2_b32 * 2 aligned by 2: 43.7 sec
ds_write_b128 aligned by 4: 14.4 sec
ds_write2_b64 aligned by 4: 25.1 sec
ds_write2_b32 * 2 aligned by 4: 7.6 sec
ds_write_b128 aligned by 8: 14.4 sec
ds_write2_b64 aligned by 8: 7.1 sec
ds_write2_b32 * 2 aligned by 8: 7.6 sec
ds_read_b128 aligned by 16: 6.2 sec
ds_read2_b64 aligned by 16: 6.3 sec
ds_read2_b32 * 2 aligned by 16: 7.5 sec
ds_read_b128 aligned by 1: 12.5 sec
ds_read2_b64 aligned by 1: 24.0 sec
ds_read2_b32 * 2 aligned by 1: 43.6 sec
ds_read_b128 aligned by 2: 12.5 sec
ds_read2_b64 aligned by 2: 24.0 sec
ds_read2_b32 * 2 aligned by 2: 43.6 sec
ds_read_b128 aligned by 4: 12.5 sec
ds_read2_b64 aligned by 4: 24.0 sec
ds_read2_b32 * 2 aligned by 4: 7.5 sec
ds_read_b128 aligned by 8: 12.5 sec
ds_read2_b64 aligned by 8: 6.3 sec
ds_read2_b32 * 2 aligned by 8: 7.5 sec
```
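To make the new behavior concrete, here is a minimal LLVM IR sketch modeled
on the ds-alignment.ll tests updated below (the function names are invented
for illustration; this snippet is not part of the commit). With
+unaligned-access-mode enabled, the first copy, aligned to less than a DWORD,
is now selected as single ds_read_b128/ds_write_b128 instructions, while the
second, DWORD-aligned copy is split by the SelectionDAG path into
ds_read2_b32/ds_write2_b32 pairs (GlobalISel still selects ds_read2_b64 for
it, as the updated ds16align4 checks show):
```
; 16-byte LDS copy aligned by 1: expected to stay a single
; ds_read_b128 / ds_write_b128, since splitting it would only produce
; several equally misaligned accesses.
define amdgpu_kernel void @copy16_align1(<4 x i32> addrspace(3)* %in,
                                         <4 x i32> addrspace(3)* %out) {
  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1
  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1
  ret void
}

; 16-byte LDS copy aligned by 4: expected to be split into DWORD-aligned
; ds_read2_b32 / ds_write2_b32 pairs, which the measurements above show to
; be at least as fast as a misaligned b128 or b64 access.
define amdgpu_kernel void @copy16_align4(<4 x i32> addrspace(3)* %in,
                                         <4 x i32> addrspace(3)* %out) {
  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4
  ret void
}
```
Both functions should compile with something like
llc -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode (the exact RUN
lines are in the tests below) to reproduce the selection described here.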
Differential Revision: https://reviews.llvm.org/D123634
Added:
Modified:
llvm/lib/Target/AMDGPU/DSInstructions.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
llvm/test/CodeGen/AMDGPU/ds-alignment.ll
llvm/test/CodeGen/AMDGPU/ds_write2.ll
llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index f56d3aaa00f7e..1fb36991403ab 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -884,8 +884,14 @@ defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;
}
-// For performance reasons, *do not* select ds_read_b128/ds_write_b128 for unaligned
-// accesses.
+// Select 128-bit loads and stores aligned to less than 4 bytes as a single
+// ds_read_b128/ds_write_b128, as this is faster than the ds_read2_b64/
+// ds_write2_b64 that would be used otherwise. In that case each b64 access
+// would still be misaligned, and there would be two of them.
+foreach vt = VReg_128.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B128, vt, "load_align_less_than_4_local">;
+defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">;
+}
} // End SubtargetPredicate = HasUnalignedAccessMode
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 56238f2a149e9..bb65557ae082e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1574,6 +1574,18 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
// gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
// single operation using ds_read2/write2_b64.
RequiredAlignment = Align(8);
+
+ if (Subtarget->hasUnalignedDSAccessEnabled()) {
+ // Naturally aligned access is fastest. However, also report it as Fast
+ // if memory is aligned to less than a DWORD: a narrow load or store will
+ // be just as slow as a single ds_read_b128/ds_write_b128, but there will
+ // be more of them, so overall we pay a smaller penalty by issuing a
+ // single instruction.
+ if (IsFast)
+ *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
+ return true;
+ }
+
break;
default:
if (Size > 32)
@@ -1584,9 +1596,11 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
if (IsFast) {
// FIXME: Lie it is fast if +unaligned-access-mode is passed so that
- // DS accesses get vectorized.
+ // DS accesses get vectorized. Do this only for sizes below 96, as the
+ // b96 and b128 cases are already properly handled.
+ // Remove the Subtarget check once all sizes are properly handled.
*IsFast = Alignment >= RequiredAlignment ||
- Subtarget->hasUnalignedDSAccessEnabled();
+ (Subtarget->hasUnalignedDSAccessEnabled() && Size < 96);
}
return Alignment >= RequiredAlignment ||
@@ -1657,8 +1671,22 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(
return false;
}
- return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
- Alignment, Flags, IsFast);
+ bool Allow = allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
+ Alignment, Flags, IsFast);
+
+ if (Allow && IsFast && Subtarget->hasUnalignedDSAccessEnabled() &&
+ (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
+ // Lie that it is fast if +unaligned-access-mode is passed so that DS
+ // accesses get vectorized. We could use ds_read2_b*/ds_write2_b*
+ // instructions on misaligned data, which is faster than a pair of
+ // ds_read_b*/ds_write_b* that would be equally misaligned.
+ // This is only used by the common passes; selection always calls the
+ // allowsMisalignedMemoryAccessesImpl version.
+ *IsFast = true;
+ }
+
+ return Allow;
}
EVT SITargetLowering::getOptimalMemOpType(
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index ba4a90e7cab02..aa4a6efeb5a23 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -585,6 +585,34 @@ def store_align16_local_m0 : PatFrag <(ops node:$value, node:$ptr),
let IsTruncStore = 0;
}
+let PredicateCode = [{return cast<MemSDNode>(N)->getAlignment() < 4;}],
+ GISelPredicateCode = [{return (*MI.memoperands_begin())->getAlign() < 4;}],
+ AddressSpaces = [ AddrSpaces.Local ] in {
+def load_align_less_than_4_local : PatFrag<(ops node:$ptr),
+ (load_local node:$ptr)> {
+ let IsLoad = 1;
+ let IsNonExtLoad = 1;
+}
+
+def load_align_less_than_4_local_m0 : PatFrag<(ops node:$ptr),
+ (load_local_m0 node:$ptr)> {
+ let IsLoad = 1;
+ let IsNonExtLoad = 1;
+}
+
+def store_align_less_than_4_local : PatFrag <(ops node:$value, node:$ptr),
+ (store_local node:$value, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+}
+
+def store_align_less_than_4_local_m0 : PatFrag <(ops node:$value, node:$ptr),
+ (store_local_m0 node:$value, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+}
+}
+
let AddressSpaces = StoreAddress_local.AddrSpaces in {
def atomic_store_local_8_m0 : PatFrag <
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index 94a618af642ec..d7b28821a7f38 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -11,7 +11,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; GFX9-NEXT: ds_read_b128 v[0:3], v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -153,7 +153,7 @@ define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x)
; GFX9-LABEL: store_lds_v4i32_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; GFX9-NEXT: ds_write_b128 v0, v[1:4]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index e3b6dce9f9213..f2589a55254f1 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -798,10 +798,10 @@ define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> add
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
; UNALIGNED-NEXT: s_endpgm
%val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1
store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1
@@ -874,10 +874,10 @@ define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> add
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
; UNALIGNED-NEXT: s_endpgm
%val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2
store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2
@@ -899,16 +899,30 @@ define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> add
; ALIGNED-NEXT: ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
; ALIGNED-NEXT: s_endpgm
;
-; UNALIGNED-LABEL: ds16align4:
-; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
-; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
-; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
-; UNALIGNED-NEXT: s_endpgm
+; UNALIGNED-SDAG-LABEL: ds16align4:
+; UNALIGNED-SDAG: ; %bb.0:
+; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:3
+; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
+; UNALIGNED-SDAG-NEXT: ds_write2_b32 v4, v0, v1 offset0:2 offset1:3
+; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
+; UNALIGNED-SDAG-NEXT: ds_write2_b32 v4, v2, v3 offset1:1
+; UNALIGNED-SDAG-NEXT: s_endpgm
+;
+; UNALIGNED-GISEL-LABEL: ds16align4:
+; UNALIGNED-GISEL: ; %bb.0:
+; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-GISEL-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; UNALIGNED-GISEL-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; UNALIGNED-GISEL-NEXT: s_endpgm
%val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 268e8bbeb8a12..4d58ed525bf07 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -987,14 +987,15 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrs
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v4, v0, 4, s4
+; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
+; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v3, v4 offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
index 2b07e3e7a9b4f..c1e69d67460c4 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
@@ -22,12 +22,10 @@ bb:
}
; GCN-LABEL: test_local_misaligned_v4:
-; ALIGNED-DAG: ds_read2_b32
-; ALIGNED-DAG: ds_read2_b32
-; ALIGNED-DAG: ds_write2_b32
-; ALIGNED-DAG: ds_write2_b32
-; UNALIGNED-DAG: ds_read2_b64
-; UNALIGNED-DAG: ds_write2_b64
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_write2_b32
+; GCN-DAG: ds_write2_b32
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()