[llvm] 3870b36 - [AMDGPU] Split unaligned 3 DWORD DS operations

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 12 08:05:41 PDT 2022


Author: Stanislav Mekhanoshin
Date: 2022-04-12T07:52:39-07:00
New Revision: 3870b3602552d118717800a71e08b7209ea65996

URL: https://github.com/llvm/llvm-project/commit/3870b3602552d118717800a71e08b7209ea65996
DIFF: https://github.com/llvm/llvm-project/commit/3870b3602552d118717800a71e08b7209ea65996.diff

LOG: [AMDGPU] Split unaligned 3 DWORD DS operations

I have written a small benchmark to check performance. Overall,
the benefit of b96 operations on data whose alignment is not
known at compile time but happens to be aligned is small, while
the performance hit of using b96 operations on genuinely
misaligned memory is high.

The only exception is when the data is not aligned even to 4
bytes; in that case it is still better to use b96.
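
In other words, a 12-byte DS access is now reported as fast only
when it is either naturally (16-byte) aligned or aligned to less
than 4 bytes; everything in between is split into b32 + b64. A
minimal standalone model of that policy follows; only the
predicate mirrors the actual SIISelLowering.cpp change below,
while the helper name and the driver are made up for illustration:

```
#include <cstdint>
#include <cstdio>

// Toy model of the new policy for 12-byte (3 DWORD) LDS accesses
// when unaligned access mode is enabled. Only the predicate body
// matches the real check in SIISelLowering.cpp.
static bool isFastB96(uint64_t AlignInBytes) {
  // Naturally aligned b96 is fastest. Below DWORD alignment every
  // option is slow, so one b96 pays less total penalty than the
  // equivalent sequence of narrow operations.
  return AlignInBytes >= 16 || AlignInBytes < 4;
}

int main() {
  for (uint64_t A : {1, 2, 4, 8, 16})
    std::printf("align %2llu: %s\n", (unsigned long long)A,
                isFastB96(A) ? "keep ds_*_b96" : "split into b32 + b64");
  return 0;
}
```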

Here is the test output on Vega and Navi:

```
Using platform: AMD Accelerated Parallel Processing
Using device: gfx900:xnack-

ds_write_b96                                  aligned: 3.4 sec
ds_write_b32 + ds_write_b64                   aligned: 4.5 sec
ds_write_b32 * 3                              aligned: 4.8 sec
ds_write_b96                          misaligned by 1: 4.8 sec
ds_write_b32 + ds_write_b64           misaligned by 1: 7.2 sec
ds_write_b32 * 3                      misaligned by 1: 10.0 sec
ds_write_b96                          misaligned by 2: 4.8 sec
ds_write_b32 + ds_write_b64           misaligned by 2: 7.2 sec
ds_write_b32 * 3                      misaligned by 2: 10.1 sec
ds_write_b96                          misaligned by 4: 4.8 sec
ds_write_b32 + ds_write_b64           misaligned by 4: 4.2 sec
ds_write_b32 * 3                      misaligned by 4: 4.9 sec
ds_write_b96                          misaligned by 8: 4.8 sec
ds_write_b32 + ds_write_b64           misaligned by 8: 4.6 sec
ds_write_b32 * 3                      misaligned by 8: 4.9 sec
ds_read_b96                                   aligned: 3.3 sec
ds_read_b32 + ds_read_b64                     aligned: 4.9 sec
ds_read_b32 * 3                               aligned: 2.6 sec
ds_read_b96                           misaligned by 1: 4.1 sec
ds_read_b32 + ds_read_b64             misaligned by 1: 7.2 sec
ds_read_b32 * 3                       misaligned by 1: 10.1 sec
ds_read_b96                           misaligned by 2: 4.1 sec
ds_read_b32 + ds_read_b64             misaligned by 2: 7.2 sec
ds_read_b32 * 3                       misaligned by 2: 10.1 sec
ds_read_b96                           misaligned by 4: 4.1 sec
ds_read_b32 + ds_read_b64             misaligned by 4: 2.6 sec
ds_read_b32 * 3                       misaligned by 4: 2.6 sec
ds_read_b96                           misaligned by 8: 4.1 sec
ds_read_b32 + ds_read_b64             misaligned by 8: 4.9 sec
ds_read_b32 * 3                       misaligned by 8: 2.6 sec

Using platform: AMD Accelerated Parallel Processing
Using device: gfx1030

ds_write_b96                                  aligned: 4.1 sec
ds_write_b32 + ds_write_b64                   aligned: 13.0 sec
ds_write_b32 * 3                              aligned: 4.5 sec
ds_write_b96                          misaligned by 1: 12.5 sec
ds_write_b32 + ds_write_b64           misaligned by 1: 22.0 sec
ds_write_b32 * 3                      misaligned by 1: 31.5 sec
ds_write_b96                          misaligned by 2: 12.4 sec
ds_write_b32 + ds_write_b64           misaligned by 2: 22.0 sec
ds_write_b32 * 3                      misaligned by 2: 31.5 sec
ds_write_b96                          misaligned by 4: 12.4 sec
ds_write_b32 + ds_write_b64           misaligned by 4: 4.0 sec
ds_write_b32 * 3                      misaligned by 4: 4.5 sec
ds_write_b96                          misaligned by 8: 12.4 sec
ds_write_b32 + ds_write_b64           misaligned by 8: 13.0 sec
ds_write_b32 * 3                      misaligned by 8: 4.5 sec
ds_read_b96                                   aligned: 3.8 sec
ds_read_b32 + ds_read_b64                     aligned: 12.8 sec
ds_read_b32 * 3                               aligned: 4.4 sec
ds_read_b96                           misaligned by 1: 10.9 sec
ds_read_b32 + ds_read_b64             misaligned by 1: 21.8 sec
ds_read_b32 * 3                       misaligned by 1: 31.5 sec
ds_read_b96                           misaligned by 2: 10.9 sec
ds_read_b32 + ds_read_b64             misaligned by 2: 21.9 sec
ds_read_b32 * 3                       misaligned by 2: 31.5 sec
ds_read_b96                           misaligned by 4: 10.9 sec
ds_read_b32 + ds_read_b64             misaligned by 4: 3.8 sec
ds_read_b32 * 3                       misaligned by 4: 4.5 sec
ds_read_b96                           misaligned by 8: 10.9 sec
ds_read_b32 + ds_read_b64             misaligned by 8: 12.8 sec
ds_read_b32 * 3                       misaligned by 8: 4.5 sec
```

Fixes: SWDEV-330802

Differential Revision: https://reviews.llvm.org/D123524

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/DSInstructions.td
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/ds-alignment.ll
    llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 43bf65d19bd86..f56d3aaa00f7e 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -877,8 +877,8 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
 
 let SubtargetPredicate = HasUnalignedAccessMode in {
 
-// FIXME: From performance point of view, is ds_read_b96/ds_write_b96 better choice
-// for unaligned accesses?
+// Selection will now split most unaligned 3 dword accesses when that is
+// beneficial for performance. Keep these two patterns for the remaining cases.
 foreach vt = VReg_96.RegTypes in {
 defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
 defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8b25b16abe923..7fb0eb2d437ad 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1553,6 +1553,18 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
       // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
       // gfx8 and older.
       RequiredAlignment = Align(16);
+
+      if (Subtarget->hasUnalignedDSAccessEnabled()) {
+        // Naturally aligned access is fastest. However, also report it as Fast
+        // if memory is aligned to less than a DWORD: a narrow load or store is
+        // just as slow as a single ds_read_b96/ds_write_b96, but more of them
+        // are needed, so overall we pay a smaller penalty by issuing a single
+        // instruction.
+        if (IsFast)
+          *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
+        return true;
+      }
+
       break;
     case 128:
       if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())

diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index 00e550a29c6be..f6ff94e2406b0 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -566,23 +566,11 @@ define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> add
 ; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
 ; ALIGNED-NEXT:    ds_write_b32 v3, v2 offset:8
 ; ALIGNED-NEXT:    s_endpgm
-;
-; UNALIGNED-LABEL: ds12align4:
-; UNALIGNED:       ; %bb.0:
-; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
-; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
-; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
-; UNALIGNED-NEXT:    s_endpgm
   %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4
   store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4
   ret void
 }
 
-; TODO: Why does the ALIGNED-SDAG code use ds_write_b64 but not ds_read_b64?
 define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
 ; ALIGNED-SDAG-LABEL: ds12align8:
 ; ALIGNED-SDAG:       ; %bb.0:
@@ -611,17 +599,6 @@ define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> add
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
 ; ALIGNED-GISEL-NEXT:    ds_write_b32 v3, v2 offset:8
 ; ALIGNED-GISEL-NEXT:    s_endpgm
-;
-; UNALIGNED-LABEL: ds12align8:
-; UNALIGNED:       ; %bb.0:
-; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
-; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
-; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
-; UNALIGNED-NEXT:    s_endpgm
   %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8
   store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
index b69765750ee75..2b07e3e7a9b4f 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
@@ -47,12 +47,10 @@ bb:
 }
 
 ; GCN-LABEL: test_local_misaligned_v3:
-; ALIGNED-DAG: ds_read2_b32
-; ALIGNED-DAG: ds_read_b32
-; ALIGNED-DAG: ds_write2_b32
-; ALIGNED-DAG: ds_write_b32
-; UNALIGNED-DAG: ds_read_b96
-; UNALIGNED-DAG: ds_write_b96
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_read_b32
+; GCN-DAG: ds_write2_b32
+; GCN-DAG: ds_write_b32
 define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()

