[llvm] 28eb9ed - [AMDGPU] Fine tune LDS misaligned access speed

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 28 16:12:18 PST 2022


Author: Stanislav Mekhanoshin
Date: 2022-11-28T16:12:02-08:00
New Revision: 28eb9ed3bb5b73e1e395ab0d86f58fe3a2e9335a

URL: https://github.com/llvm/llvm-project/commit/28eb9ed3bb5b73e1e395ab0d86f58fe3a2e9335a
DIFF: https://github.com/llvm/llvm-project/commit/28eb9ed3bb5b73e1e395ab0d86f58fe3a2e9335a.diff

LOG: [AMDGPU] Fine tune LDS misaligned access speed

Differential Revision: https://reviews.llvm.org/D124219

Added: 
    llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll

Modified: 
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 74fd55b165285..198dee022532c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1423,8 +1423,21 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
         // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
         // ds_write2_b32 depending on the alignment. In either case with either
         // alignment there is no faster way of doing this.
+
+        // The numbers returned here and below are not additive; they form a
+        // 'speed rank'. They are only meant to be compared to decide whether
+        // one way of lowering an operation is faster than another. For that
+        // purpose a naturally aligned operation gets its bitsize to indicate
+        // that "it operates at a speed comparable to an N-bit wide load".
+        // With full alignment ds128 is slower than ds96, for example. If
+        // underaligned, the access is comparable in speed to a single dword
+        // access, which would then mean 32 < 128 and it is faster to issue a
+        // wide load regardless. 1 simply means "slow, don't do it", i.e. when
+        // comparing an aligned load to a wider load which is no longer
+        // aligned, the latter is slower.
         if (IsFast)
-          *IsFast = 1;
+          *IsFast = (Alignment >= RequiredAlignment) ? 64
+                    : (Alignment < Align(4))         ? 32
+                                                     : 1;
         return true;
       }
 
@@ -1442,8 +1455,12 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
         // be equally slow as a single ds_read_b96/ds_write_b96, but there will
         // be more of them, so overall we will pay less penalty issuing a single
         // instruction.
+
+        // See comment on the values above.
         if (IsFast)
-          *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
+          *IsFast = (Alignment >= RequiredAlignment) ? 96
+                    : (Alignment < Align(4))         ? 32
+                                                     : 1;
         return true;
       }
 
@@ -1463,8 +1480,12 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
         // be equally slow as a single ds_read_b128/ds_write_b128, but there
         // will be more of them, so overall we will pay less penalty issuing a
         // single instruction.
+
+        // See comment on the values above.
         if (IsFast)
-          *IsFast= Alignment >= RequiredAlignment || Alignment < Align(4);
+          *IsFast = (Alignment >= RequiredAlignment) ? 128
+                    : (Alignment < Align(4))         ? 32
+                                                     : 1;
         return true;
       }
 
@@ -1476,8 +1497,11 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
       break;
     }
 
+    // See comment on the values above.
+    // Note that we have a single-dword or sub-dword access here, so if it is
+    // underaligned it is the slowest possible access, hence the returned
+    // value is 0.
     if (IsFast)
-      *IsFast = Alignment >= RequiredAlignment;
+      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
 
     return Alignment >= RequiredAlignment ||
            Subtarget->hasUnalignedDSAccessEnabled();
@@ -1535,22 +1559,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
 bool SITargetLowering::allowsMisalignedMemoryAccesses(
     EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
     unsigned *IsFast) const {
-  bool Allow = allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
-                                                  Alignment, Flags, IsFast);
-
-  if (Allow && IsFast && Subtarget->hasUnalignedDSAccessEnabled() &&
-      (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-       AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
-    // Lie it is fast if +unaligned-access-mode is passed so that DS accesses
-    // get vectorized. We could use ds_read2_b*/ds_write2_b* instructions on a
-    // misaligned data which is faster than a pair of ds_read_b*/ds_write_b*
-    // which would be equally misaligned.
-    // This is only used by the common passes, selection always calls the
-    // allowsMisalignedMemoryAccessesImpl version.
-    *IsFast= 1;
-  }
-
-  return Allow;
+  return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
+                                            Alignment, Flags, IsFast);
 }
 
 EVT SITargetLowering::getOptimalMemOpType(
@@ -8785,7 +8795,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     auto Flags = Load->getMemOperand()->getFlags();
     if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
                                            Load->getAlign(), Flags, &Fast) &&
-        Fast)
+        Fast > 1)
       return SDValue();
 
     if (MemVT.isVector())
@@ -9284,7 +9294,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
     auto Flags = Store->getMemOperand()->getFlags();
     if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
                                            Store->getAlign(), Flags, &Fast) &&
-        Fast)
+        Fast > 1)
       return SDValue();
 
     if (VT.isVector())

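To make the 'speed rank' comment above concrete, here is a minimal standalone
sketch of the intended comparisons. This is not the in-tree code: the helper
name dsSpeedRank, its simplified signature and the hard-coded RequiredAlign
values below are only illustrative.

#include <cassert>

// Hypothetical model of the LDS "speed rank" written to *IsFast above. The
// value is not a cost to be summed; it is only meant to be compared with the
// rank of an alternative lowering of the same data. RequiredAlignInBytes
// stands in for the per-size RequiredAlignment the real code computes.
static unsigned dsSpeedRank(unsigned SizeInBits, unsigned AlignInBytes,
                            unsigned RequiredAlignInBytes) {
  if (SizeInBits > 32) {
    if (AlignInBytes >= RequiredAlignInBytes)
      return SizeInBits; // full-speed b64/b96/b128 (or the read2/write2 forms)
    if (AlignInBytes < 4)
      return 32;         // roughly single-dword speed, but one wide access
                         // still beats several equally underaligned ones
    return 1;            // dword-aligned yet underaligned wide access: slow
  }
  // Single-dword or sub-dword access.
  return AlignInBytes >= RequiredAlignInBytes ? SizeInBits : 0;
}

int main() {
  // ds8align8 from the new test: two naturally aligned b64 accesses (rank 64
  // each) beat one b128 access at align 8 (rank 1), so the access is not
  // widened and ds_read2_b64/ds_write2_b64 are selected instead.
  assert(dsSpeedRank(128, /*Align=*/8, /*Required=*/16) <
         dsSpeedRank(64, /*Align=*/8, /*Required=*/8));

  // At align 2 even the dword pieces would be underaligned (rank 0), so a
  // single underaligned wide access (rank 32) is still the better choice.
  assert(dsSpeedRank(128, /*Align=*/2, /*Required=*/16) >
         dsSpeedRank(32, /*Align=*/2, /*Required=*/4));

  // LowerLOAD/LowerSTORE above only keep a misaligned access wide when the
  // rank is greater than 1; rank 1 means "split it".
  assert(dsSpeedRank(128, /*Align=*/16, /*Required=*/16) > 1);
  return 0;
}
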
diff  --git a/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll
new file mode 100644
index 0000000000000..fc75ef69be032
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll
@@ -0,0 +1,107 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefix=GCN %s
+
+; Check that the vectorizer does not create slow misaligned loads
+
+; GCN-LABEL: {{^}}ds1align1:
+; GCN-COUNT-2: ds_read_u8
+; GCN-COUNT-2: ds_write_b8
+define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
+  %val1 = load i8, i8 addrspace(3)* %in, align 1
+  %gep1 = getelementptr i8, i8 addrspace(3)* %in, i32 1
+  %val2 = load i8, i8 addrspace(3)* %gep1, align 1
+  store i8 %val1, i8 addrspace(3)* %out, align 1
+  %gep2 = getelementptr i8, i8 addrspace(3)* %out, i32 1
+  store i8 %val2, i8 addrspace(3)* %gep2, align 1
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds2align2:
+; GCN-COUNT-2: ds_read_u16
+; GCN-COUNT-2: ds_write_b16
+define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
+  %val1 = load i16, i16 addrspace(3)* %in, align 2
+  %gep1 = getelementptr i16, i16 addrspace(3)* %in, i32 1
+  %val2 = load i16, i16 addrspace(3)* %gep1, align 2
+  store i16 %val1, i16 addrspace(3)* %out, align 2
+  %gep2 = getelementptr i16, i16 addrspace(3)* %out, i32 1
+  store i16 %val2, i16 addrspace(3)* %gep2, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds4align4:
+; GCN: ds_read2_b32
+; GCN: ds_write2_b32
+define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+  %val1 = load i32, i32 addrspace(3)* %in, align 4
+  %gep1 = getelementptr i32, i32 addrspace(3)* %in, i32 1
+  %val2 = load i32, i32 addrspace(3)* %gep1, align 4
+  store i32 %val1, i32 addrspace(3)* %out, align 4
+  %gep2 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+  store i32 %val2, i32 addrspace(3)* %gep2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds8align8:
+; GCN: ds_read2_b64
+; GCN: ds_write2_b64
+define amdgpu_kernel void @ds8align8(i64 addrspace(3)* %in, i64 addrspace(3)* %out) {
+  %val1 = load i64, i64 addrspace(3)* %in, align 8
+  %gep1 = getelementptr i64, i64 addrspace(3)* %in, i64 1
+  %val2 = load i64, i64 addrspace(3)* %gep1, align 8
+  store i64 %val1, i64 addrspace(3)* %out, align 8
+  %gep2 = getelementptr i64, i64 addrspace(3)* %out, i64 1
+  store i64 %val2, i64 addrspace(3)* %gep2, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds1align2:
+; GCN: ds_read_u16
+; GCN: ds_write_b16
+define amdgpu_kernel void @ds1align2(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
+  %val1 = load i8, i8 addrspace(3)* %in, align 2
+  %gep1 = getelementptr i8, i8 addrspace(3)* %in, i32 1
+  %val2 = load i8, i8 addrspace(3)* %gep1, align 2
+  store i8 %val1, i8 addrspace(3)* %out, align 2
+  %gep2 = getelementptr i8, i8 addrspace(3)* %out, i32 1
+  store i8 %val2, i8 addrspace(3)* %gep2, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds2align4:
+; GCN: ds_read_b32
+; GCN: ds_write_b32
+define amdgpu_kernel void @ds2align4(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
+  %val1 = load i16, i16 addrspace(3)* %in, align 4
+  %gep1 = getelementptr i16, i16 addrspace(3)* %in, i32 1
+  %val2 = load i16, i16 addrspace(3)* %gep1, align 4
+  store i16 %val1, i16 addrspace(3)* %out, align 4
+  %gep2 = getelementptr i16, i16 addrspace(3)* %out, i32 1
+  store i16 %val2, i16 addrspace(3)* %gep2, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds4align8:
+; GCN: ds_read_b64
+; GCN: ds_write_b64
+define amdgpu_kernel void @ds4align8(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+  %val1 = load i32, i32 addrspace(3)* %in, align 8
+  %gep1 = getelementptr i32, i32 addrspace(3)* %in, i32 1
+  %val2 = load i32, i32 addrspace(3)* %gep1, align 8
+  store i32 %val1, i32 addrspace(3)* %out, align 8
+  %gep2 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+  store i32 %val2, i32 addrspace(3)* %gep2, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds8align16:
+; GCN: ds_read_b128
+; GCN: ds_write_b128
+define amdgpu_kernel void @ds8align16(i64 addrspace(3)* %in, i64 addrspace(3)* %out) {
+  %val1 = load i64, i64 addrspace(3)* %in, align 16
+  %gep1 = getelementptr i64, i64 addrspace(3)* %in, i64 1
+  %val2 = load i64, i64 addrspace(3)* %gep1, align 16
+  store i64 %val1, i64 addrspace(3)* %out, align 16
+  %gep2 = getelementptr i64, i64 addrspace(3)* %out, i64 1
+  store i64 %val2, i64 addrspace(3)* %gep2, align 16
+  ret void
+}
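+
And a similarly hedged sketch of the consumer side, i.e. the 'Fast > 1' guard
added to LowerLOAD/LowerSTORE above. keepWideDSAccess is a made-up name;
Allowed and Rank stand in for the return value and the *IsFast output of
allowsMisalignedMemoryAccessesImpl.

#include <cassert>

// A misaligned DS access is kept wide only if it is both allowed and its
// speed rank is better than "slow" (rank 1).
static bool keepWideDSAccess(bool Allowed, unsigned Rank) {
  return Allowed && Rank > 1;
}

int main() {
  // Fully aligned b128 (rank 128): keep it wide; ds8align16 above checks
  // for ds_read_b128/ds_write_b128.
  assert(keepWideDSAccess(true, 128));

  // A 128-bit access at align 8 gets rank 1, so it is not kept wide; this is
  // why ds8align8 above checks for ds_read2_b64 rather than ds_read_b128.
  assert(!keepWideDSAccess(true, 1));

  // A 128-bit access at align 2 (rank 32, with unaligned DS access enabled)
  // stays wide rather than being split into equally underaligned pieces.
  assert(keepWideDSAccess(true, 32));
  return 0;
}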

diff  --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 0556cf78683e8..77fe9d3817c61 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -312,9 +312,10 @@ ret:
 
 ; GFX11-LABEL: tied_operand_test:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11:         scratch_load_d16_hi_b16 [[LDRESULT:v[0-9]+]], off, off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ds_store_b32 v{{[0-9]+}}, [[LDRESULT]] offset:8
+; GFX11-DAG:     scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off offset:4
+; GFX11-DAG:     v_mov_b32_e32 [[C:v[0-9]+]], 0x7b
+; GFX11-DAG:     ds_store_b16 v{{[0-9]+}}, [[LDRESULT]]  offset:10
+; GFX11-DAG:     ds_store_b16 v{{[0-9]+}}, [[C]]  offset:8
 ; GFX11-NEXT:    s_endpgm
 define protected amdgpu_kernel void @tied_operand_test(i1 %c1, i1 %c2, i32 %val) {
 entry:


        

