[llvm] 28eb9ed - [AMDGPU] Fine tune LDS misaligned access speed
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 28 16:12:18 PST 2022
Author: Stanislav Mekhanoshin
Date: 2022-11-28T16:12:02-08:00
New Revision: 28eb9ed3bb5b73e1e395ab0d86f58fe3a2e9335a
URL: https://github.com/llvm/llvm-project/commit/28eb9ed3bb5b73e1e395ab0d86f58fe3a2e9335a
DIFF: https://github.com/llvm/llvm-project/commit/28eb9ed3bb5b73e1e395ab0d86f58fe3a2e9335a.diff
LOG: [AMDGPU] Fine tune LDS misaligned access speed
Differential Revision: https://reviews.llvm.org/D124219
Added:
llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 74fd55b165285..198dee022532c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1423,8 +1423,21 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
// We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
// ds_write2_b32 depending on the alignment. In either case with either
// alignment there is no faster way of doing this.
+
+ // The numbers returned here and below are not additive; they form a 'speed
+ // rank'. They are only meant to be compared to decide whether one way of
+ // lowering an operation is faster than another. For that purpose a
+ // naturally aligned operation gets its bitsize to indicate that "it
+ // operates at a speed comparable to an N-bit wide load". With full
+ // alignment ds128 is slower than ds96, for example. If underaligned, the
+ // access is comparable in speed to a single dword access, which would
+ // then mean 32 < 128 and it is faster to issue a wide load regardless.
+ // A value of 1 simply means "slow, don't do it": when comparing an aligned
+ // load to a wider load that would no longer be aligned, the latter is slower.
if (IsFast)
- *IsFast = 1;
+ *IsFast = (Alignment >= RequiredAlignment) ? 64
+ : (Alignment < Align(4)) ? 32
+ : 1;
return true;
}
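For illustration, here is a minimal standalone sketch (not LLVM code) of how such ranks are meant to be compared. The SpeedRank helper only mirrors the ternary pattern added in this hunk; its name and parameters are hypothetical, and the 16-byte required alignment used in main() is an assumed example value, not a claim about the RequiredAlignment logic in SIISelLowering.cpp.

// Minimal standalone sketch, not LLVM code.
#include <cstdio>

// 'Speed rank' of a wide LDS access: the full bit size when the required
// alignment is met, 32 when underaligned below a dword (it then runs at
// single-dword speed), and 1 ("slow, don't do it") otherwise.
static unsigned SpeedRank(unsigned SizeBits, unsigned AlignBytes,
                          unsigned RequiredAlignBytes) {
  if (AlignBytes >= RequiredAlignBytes)
    return SizeBits;
  return AlignBytes < 4 ? 32 : 1;
}

int main() {
  const unsigned Req = 16; // assumed requirement for a 128-bit access
  std::printf("fully aligned b128: %u\n", SpeedRank(128, 16, Req)); // 128
  std::printf("byte aligned b128:  %u\n", SpeedRank(128, 1, Req));  // 32
  std::printf("dword aligned b128: %u\n", SpeedRank(128, 4, Req));  // 1
  return 0;
}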
@@ -1442,8 +1455,12 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
// be equally slow as a single ds_read_b96/ds_write_b96, but there will
// be more of them, so overall we will pay less penalty issuing a single
// instruction.
+
+ // See comment on the values above.
if (IsFast)
- *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
+ *IsFast = (Alignment >= RequiredAlignment) ? 96
+ : (Alignment < Align(4)) ? 32
+ : 1;
return true;
}
@@ -1463,8 +1480,12 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
// be equally slow as a single ds_read_b128/ds_write_b128, but there
// will be more of them, so overall we will pay less penalty issuing a
// single instruction.
+
+ // See comment on the values above.
if (IsFast)
- *IsFast= Alignment >= RequiredAlignment || Alignment < Align(4);
+ *IsFast = (Alignment >= RequiredAlignment) ? 128
+ : (Alignment < Align(4)) ? 32
+ : 1;
return true;
}
@@ -1476,8 +1497,11 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
break;
}
+ // See comment on the values above.
+ // Note that this is a single-dword or sub-dword access, so if underaligned
+ // it is the slowest possible access, hence the returned value is 0.
if (IsFast)
- *IsFast = Alignment >= RequiredAlignment;
+ *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
return Alignment >= RequiredAlignment ||
Subtarget->hasUnalignedDSAccessEnabled();
@@ -1535,22 +1559,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
bool SITargetLowering::allowsMisalignedMemoryAccesses(
EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
unsigned *IsFast) const {
- bool Allow = allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
- Alignment, Flags, IsFast);
-
- if (Allow && IsFast && Subtarget->hasUnalignedDSAccessEnabled() &&
- (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
- // Lie it is fast if +unaligned-access-mode is passed so that DS accesses
- // get vectorized. We could use ds_read2_b*/ds_write2_b* instructions on a
- // misaligned data which is faster than a pair of ds_read_b*/ds_write_b*
- // which would be equally misaligned.
- // This is only used by the common passes, selection always calls the
- // allowsMisalignedMemoryAccessesImpl version.
- *IsFast= 1;
- }
-
- return Allow;
+ return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
+ Alignment, Flags, IsFast);
}
EVT SITargetLowering::getOptimalMemOpType(
@@ -8785,7 +8795,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
auto Flags = Load->getMemOperand()->getFlags();
if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
Load->getAlign(), Flags, &Fast) &&
- Fast)
+ Fast > 1)
return SDValue();
if (MemVT.isVector())
@@ -9284,7 +9294,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
auto Flags = Store->getMemOperand()->getFlags();
if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
Store->getAlign(), Flags, &Fast) &&
- Fast)
+ Fast > 1)
return SDValue();
if (VT.isVector())
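The two Fast > 1 checks above are the consumers of the new ranks: a rank of 1 falls through to the splitting/expansion code below them, while any higher rank takes the early return and keeps the access as is. A minimal standalone sketch of that decision follows; the decide() function and its strings are hypothetical, not LLVM code.

// Standalone sketch of the 'Fast > 1' consumer pattern, not LLVM code.
#include <cstdio>

// Legal stands in for the boolean result of the alignment query, Rank for
// the value written through *IsFast.
static const char *decide(bool Legal, unsigned Rank) {
  if (Legal && Rank > 1)
    return "keep the single wide LDS access";
  return "split into narrower accesses";
}

int main() {
  std::printf("rank 128 -> %s\n", decide(true, 128)); // naturally aligned
  std::printf("rank  32 -> %s\n", decide(true, 32));  // underaligned, dword speed
  std::printf("rank   1 -> %s\n", decide(true, 1));   // slow, expand instead
  return 0;
}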
diff --git a/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll
new file mode 100644
index 0000000000000..fc75ef69be032
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll
@@ -0,0 +1,107 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefix=GCN %s
+
+; Check that the vectorizer does not create slow misaligned loads.
+
+; GCN-LABEL: {{^}}ds1align1:
+; GCN-COUNT-2: ds_read_u8
+; GCN-COUNT-2: ds_write_b8
+define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
+ %val1 = load i8, i8 addrspace(3)* %in, align 1
+ %gep1 = getelementptr i8, i8 addrspace(3)* %in, i32 1
+ %val2 = load i8, i8 addrspace(3)* %gep1, align 1
+ store i8 %val1, i8 addrspace(3)* %out, align 1
+ %gep2 = getelementptr i8, i8 addrspace(3)* %out, i32 1
+ store i8 %val2, i8 addrspace(3)* %gep2, align 1
+ ret void
+}
+
+; GCN-LABEL: {{^}}ds2align2:
+; GCN-COUNT-2: ds_read_u16
+; GCN-COUNT-2: ds_write_b16
+define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
+ %val1 = load i16, i16 addrspace(3)* %in, align 2
+ %gep1 = getelementptr i16, i16 addrspace(3)* %in, i32 1
+ %val2 = load i16, i16 addrspace(3)* %gep1, align 2
+ store i16 %val1, i16 addrspace(3)* %out, align 2
+ %gep2 = getelementptr i16, i16 addrspace(3)* %out, i32 1
+ store i16 %val2, i16 addrspace(3)* %gep2, align 2
+ ret void
+}
+
+; GCN-LABEL: {{^}}ds4align4:
+; GCN: ds_read2_b32
+; GCN: ds_write2_b32
+define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+ %val1 = load i32, i32 addrspace(3)* %in, align 4
+ %gep1 = getelementptr i32, i32 addrspace(3)* %in, i32 1
+ %val2 = load i32, i32 addrspace(3)* %gep1, align 4
+ store i32 %val1, i32 addrspace(3)* %out, align 4
+ %gep2 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+ store i32 %val2, i32 addrspace(3)* %gep2, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}ds8align8:
+; GCN: ds_read2_b64
+; GCN: ds_write2_b64
+define amdgpu_kernel void @ds8align8(i64 addrspace(3)* %in, i64 addrspace(3)* %out) {
+ %val1 = load i64, i64 addrspace(3)* %in, align 8
+ %gep1 = getelementptr i64, i64 addrspace(3)* %in, i64 1
+ %val2 = load i64, i64 addrspace(3)* %gep1, align 8
+ store i64 %val1, i64 addrspace(3)* %out, align 8
+ %gep2 = getelementptr i64, i64 addrspace(3)* %out, i64 1
+ store i64 %val2, i64 addrspace(3)* %gep2, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}ds1align2:
+; GCN: ds_read_u16
+; GCN: ds_write_b16
+define amdgpu_kernel void @ds1align2(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
+ %val1 = load i8, i8 addrspace(3)* %in, align 2
+ %gep1 = getelementptr i8, i8 addrspace(3)* %in, i32 1
+ %val2 = load i8, i8 addrspace(3)* %gep1, align 2
+ store i8 %val1, i8 addrspace(3)* %out, align 2
+ %gep2 = getelementptr i8, i8 addrspace(3)* %out, i32 1
+ store i8 %val2, i8 addrspace(3)* %gep2, align 2
+ ret void
+}
+
+; GCN-LABEL: {{^}}ds2align4:
+; GCN: ds_read_b32
+; GCN: ds_write_b32
+define amdgpu_kernel void @ds2align4(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
+ %val1 = load i16, i16 addrspace(3)* %in, align 4
+ %gep1 = getelementptr i16, i16 addrspace(3)* %in, i32 1
+ %val2 = load i16, i16 addrspace(3)* %gep1, align 4
+ store i16 %val1, i16 addrspace(3)* %out, align 4
+ %gep2 = getelementptr i16, i16 addrspace(3)* %out, i32 1
+ store i16 %val2, i16 addrspace(3)* %gep2, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}ds4align8:
+; GCN: ds_read_b64
+; GCN: ds_write_b64
+define amdgpu_kernel void @ds4align8(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+ %val1 = load i32, i32 addrspace(3)* %in, align 8
+ %gep1 = getelementptr i32, i32 addrspace(3)* %in, i32 1
+ %val2 = load i32, i32 addrspace(3)* %gep1, align 8
+ store i32 %val1, i32 addrspace(3)* %out, align 8
+ %gep2 = getelementptr i32, i32 addrspace(3)* %out, i32 1
+ store i32 %val2, i32 addrspace(3)* %gep2, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}ds8align16:
+; GCN: ds_read_b128
+; GCN: ds_write_b128
+define amdgpu_kernel void @ds8align16(i64 addrspace(3)* %in, i64 addrspace(3)* %out) {
+ %val1 = load i64, i64 addrspace(3)* %in, align 16
+ %gep1 = getelementptr i64, i64 addrspace(3)* %in, i64 1
+ %val2 = load i64, i64 addrspace(3)* %gep1, align 16
+ store i64 %val1, i64 addrspace(3)* %out, align 16
+ %gep2 = getelementptr i64, i64 addrspace(3)* %out, i64 1
+ store i64 %val2, i64 addrspace(3)* %gep2, align 16
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 0556cf78683e8..77fe9d3817c61 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -312,9 +312,10 @@ ret:
; GFX11-LABEL: tied_operand_test:
; GFX11: ; %bb.0: ; %entry
-; GFX11: scratch_load_d16_hi_b16 [[LDRESULT:v[0-9]+]], off, off offset:4
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: ds_store_b32 v{{[0-9]+}}, [[LDRESULT]] offset:8
+; GFX11-DAG: scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off offset:4
+; GFX11-DAG: v_mov_b32_e32 [[C:v[0-9]+]], 0x7b
+; GFX11-DAG: ds_store_b16 v{{[0-9]+}}, [[LDRESULT]] offset:10
+; GFX11-DAG: ds_store_b16 v{{[0-9]+}}, [[C]] offset:8
; GFX11-NEXT: s_endpgm
define protected amdgpu_kernel void @tied_operand_test(i1 %c1, i1 %c2, i32 %val) {
entry: