[llvm] ac94073 - [AMDGPU] Refine 64 bit misaligned LDS ops selection
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 21 09:37:23 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-04-21T09:37:16-07:00
New Revision: ac94073daa18687b76dc49a60bb2844799f28ee3
URL: https://github.com/llvm/llvm-project/commit/ac94073daa18687b76dc49a60bb2844799f28ee3
DIFF: https://github.com/llvm/llvm-project/commit/ac94073daa18687b76dc49a60bb2844799f28ee3.diff
LOG: [AMDGPU] Refine 64 bit misaligned LDS ops selection
Here is the performance data:
```
Using platform: AMD Accelerated Parallel Processing
Using device: gfx900:xnack-
ds_write_b64 aligned by 8: 3.2 sec
ds_write2_b32 aligned by 8: 3.2 sec
ds_write_b16 * 4 aligned by 8: 7.0 sec
ds_write_b8 * 8 aligned by 8: 13.2 sec
ds_write_b64 aligned by 1: 7.3 sec
ds_write2_b32 aligned by 1: 7.5 sec
ds_write_b16 * 4 aligned by 1: 14.0 sec
ds_write_b8 * 8 aligned by 1: 13.2 sec
ds_write_b64 aligned by 2: 7.3 sec
ds_write2_b32 aligned by 2: 7.5 sec
ds_write_b16 * 4 aligned by 2: 7.1 sec
ds_write_b8 * 8 aligned by 2: 13.3 sec
ds_write_b64 aligned by 4: 4.6 sec
ds_write2_b32 aligned by 4: 3.2 sec
ds_write_b16 * 4 aligned by 4: 7.1 sec
ds_write_b8 * 8 aligned by 4: 13.3 sec
ds_read_b64 aligned by 8: 2.3 sec
ds_read2_b32 aligned by 8: 2.2 sec
ds_read_u16 * 4 aligned by 8: 4.8 sec
ds_read_u8 * 8 aligned by 8: 8.6 sec
ds_read_b64 aligned by 1: 4.4 sec
ds_read2_b32 aligned by 1: 7.3 sec
ds_read_u16 * 4 aligned by 1: 14.0 sec
ds_read_u8 * 8 aligned by 1: 8.7 sec
ds_read_b64 aligned by 2: 4.4 sec
ds_read2_b32 aligned by 2: 7.3 sec
ds_read_u16 * 4 aligned by 2: 4.8 sec
ds_read_u8 * 8 aligned by 2: 8.7 sec
ds_read_b64 aligned by 4: 4.4 sec
ds_read2_b32 aligned by 4: 2.3 sec
ds_read_u16 * 4 aligned by 4: 4.8 sec
ds_read_u8 * 8 aligned by 4: 8.7 sec
Using platform: AMD Accelerated Parallel Processing
Using device: gfx1030
ds_write_b64 aligned by 8: 4.4 sec
ds_write2_b32 aligned by 8: 4.3 sec
ds_write_b16 * 4 aligned by 8: 7.9 sec
ds_write_b8 * 8 aligned by 8: 13.0 sec
ds_write_b64 aligned by 1: 23.2 sec
ds_write2_b32 aligned by 1: 23.1 sec
ds_write_b16 * 4 aligned by 1: 44.0 sec
ds_write_b8 * 8 aligned by 1: 13.0 sec
ds_write_b64 aligned by 2: 23.2 sec
ds_write2_b32 aligned by 2: 23.1 sec
ds_write_b16 * 4 aligned by 2: 7.9 sec
ds_write_b8 * 8 aligned by 2: 13.1 sec
ds_write_b64 aligned by 4: 13.5 sec
ds_write2_b32 aligned by 4: 4.3 sec
ds_write_b16 * 4 aligned by 4: 7.9 sec
ds_write_b8 * 8 aligned by 4: 13.1 sec
ds_read_b64 aligned by 8: 3.5 sec
ds_read2_b32 aligned by 8: 3.4 sec
ds_read_u16 * 4 aligned by 8: 5.3 sec
ds_read_u8 * 8 aligned by 8: 8.5 sec
ds_read_b64 aligned by 1: 13.1 sec
ds_read2_b32 aligned by 1: 22.7 sec
ds_read_u16 * 4 aligned by 1: 43.9 sec
ds_read_u8 * 8 aligned by 1: 7.9 sec
ds_read_b64 aligned by 2: 13.1 sec
ds_read2_b32 aligned by 2: 22.7 sec
ds_read_u16 * 4 aligned by 2: 5.6 sec
ds_read_u8 * 8 aligned by 2: 7.9 sec
ds_read_b64 aligned by 4: 13.1 sec
ds_read2_b32 aligned by 4: 3.4 sec
ds_read_u16 * 4 aligned by 4: 5.6 sec
ds_read_u8 * 8 aligned by 4: 7.9 sec
```
GFX10 exposes a different pattern for sub-DWORD load/store performance
than GFX9. On GFX9 it is faster to issue a single unaligned load or
store than a fully split b8 access, whereas on GFX10 even a full split
is better. However, this gain is only theoretical, because splitting
an access down to the sub-dword level requires more registers and
packing/unpacking logic. Ignoring that option, it is better to use a
single 64-bit instruction on misaligned data, with the exception of
4-byte aligned data, where ds_read2_b32/ds_write2_b32 is better.
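For illustration, here is a minimal IR sketch in the style of the
ds-alignment.ll tests updated below (the function name lds_copy_align1 is
hypothetical): with unaligned access mode enabled, this align-1 copy is
expected to select a single ds_read_b64/ds_write_b64 pair, while the same
copy at align 4 would keep ds_read2_b32/ds_write2_b32.
```
; 64-bit LDS copy with alignment below 4: selected as ds_read_b64/ds_write_b64
; after this change instead of ds_read2_b32/ds_write2_b32.
define amdgpu_kernel void @lds_copy_align1(<2 x i32> addrspace(3)* %in,
                                           <2 x i32> addrspace(3)* %out) {
  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1
  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1
  ret void
}
```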
Differential Revision: https://reviews.llvm.org/D123956
Added:
Modified:
llvm/lib/Target/AMDGPU/DSInstructions.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/ds-alignment.ll
llvm/test/CodeGen/AMDGPU/ds_read2.ll
llvm/test/CodeGen/AMDGPU/ds_write2.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index bc2722e027594..049d8000d31ce 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -879,6 +879,15 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
let SubtargetPredicate = HasUnalignedAccessMode in {
+// Select 64 bit loads and stores aligned less than 4 as a single ds_read_b64/
+// ds_write_b64 instruction as this is faster than ds_read2_b32/ds_write2_b32
+// which would be used otherwise. In this case a b32 access would still be
+// misaligned, but we will have 2 of them.
+foreach vt = VReg_64.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B64, vt, "load_align_less_than_4_local">;
+defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align_less_than_4_local">;
+}
+
// Selection will split most of the unaligned 3 dword accesses due to performance
// reasons when beneficial. Keep these two patterns for the rest of the cases.
foreach vt = VReg_96.RegTypes in {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 246376aabaf71..bd24364c35989 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1545,6 +1545,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
// can do a 4 byte aligned, 8 byte access in a single operation using
// ds_read2/write2_b32 with adjacent offsets.
RequiredAlignment = Align(4);
+
+ if (Subtarget->hasUnalignedDSAccessEnabled()) {
+ // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
+ // ds_write2_b32 depending on the alignment. In either case with either
+ // alignment there is no faster way of doing this.
+ if (IsFast)
+ *IsFast = true;
+ return true;
+ }
+
break;
case 96:
if (!Subtarget->hasDS96AndDS128())
@@ -1593,14 +1603,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
break;
}
- if (IsFast) {
- // FIXME: Lie it is fast if +unaligned-access-mode is passed so that
- // DS accesses get vectorized. Do this only for sizes below 96 as
- // b96 and b128 cases already properly handled.
- // Remove Subtarget check once all sizes properly handled.
- *IsFast = Alignment >= RequiredAlignment ||
- (Subtarget->hasUnalignedDSAccessEnabled() && Size < 96);
- }
+ if (IsFast)
+ *IsFast = Alignment >= RequiredAlignment;
return Alignment >= RequiredAlignment ||
Subtarget->hasUnalignedDSAccessEnabled();
diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index f2589a55254f1..28b830903c59b 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -278,10 +278,10 @@ define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addr
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0
; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
+; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1]
; UNALIGNED-NEXT: s_endpgm
%val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1
store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1
@@ -334,10 +334,10 @@ define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addr
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0
; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
+; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1]
; UNALIGNED-NEXT: s_endpgm
%val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2
store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 7af9c948a1e41..64aa283314e45 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -691,8 +691,8 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out,
; GFX9-UNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_add3_u32 v0, s2, v2, 5
-; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v0, s2, v2
+; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 offset:5
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-UNALIGNED-NEXT: global_store_dword v2, v0, s[0:1]
@@ -1530,10 +1530,9 @@ define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)*
;
; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset:
; GFX9-UNALIGNED: ; %bb.0: ; %entry
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41
-; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-UNALIGNED-NEXT: ds_read_b64 v[0:1], v2 offset:65
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-UNALIGNED-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 4d58ed525bf07..32ee858b56ba9 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -708,11 +708,9 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(double add
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, s4, v2
-; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v3, 5, v2
-; GFX9-UNALIGNED-NEXT: v_add_u32_e32 v2, 9, v2
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-UNALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
-; GFX9-UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
+; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:5
+; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:9
; GFX9-UNALIGNED-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
@@ -1043,10 +1041,10 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
;
; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset:
; GFX9-UNALIGNED: ; %bb.0: ; %entry
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x41
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x1c8
-; GFX9-UNALIGNED-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x1c8
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:65
; GFX9-UNALIGNED-NEXT: s_endpgm
entry:
store <2 x i32> <i32 123, i32 456>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1