[llvm] b8e09f1 - [AMDGPU] Refactor LDS alignment checks.
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 12 07:49:53 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-04-12T07:49:40-07:00
New Revision: b8e09f15539a12b27429460cd9eaf38e83c2d634
URL: https://github.com/llvm/llvm-project/commit/b8e09f15539a12b27429460cd9eaf38e83c2d634
DIFF: https://github.com/llvm/llvm-project/commit/b8e09f15539a12b27429460cd9eaf38e83c2d634.diff
LOG: [AMDGPU] Refactor LDS alignment checks.
Move feature/bug checks into a single place,
allowsMisalignedMemoryAccessesImpl.
This is mostly NFCI, except for the order of selection in a couple of places.
A separate change may be needed to stop lying about Fast.
Differential Revision: https://reviews.llvm.org/D123343
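For readers skimming the diff below, the consolidated LDS branch can be summarized roughly as follows. This is a minimal standalone C++ sketch of the control flow after the refactor, not the actual SIISelLowering code: the subtarget feature queries are simplified into plain booleans, Align is modeled as an unsigned byte count, and the names (Features, allowsMisalignedLDSAccess) are illustrative only.

#include <bit>
#include <iostream>

// Illustrative stand-ins for the subtarget feature queries used in the patch;
// not the real GCNSubtarget API.
struct Features {
  bool UnalignedDSAccessEnabled = false; // DS alignment checks relaxed
  bool LDSMisalignedBug = false;         // misaligned-LDS hardware bug
  bool DS96AndDS128 = false;             // ds_read/write_b96/b128 available
  bool UseDS128 = false;                 // 128-bit DS ops enabled
};

// Roughly mirrors the refactored LDS/REGION branch of
// allowsMisalignedMemoryAccessesImpl: one switch picks the required alignment
// per access size, then a single expression decides both legality and IsFast.
bool allowsMisalignedLDSAccess(unsigned SizeInBits, unsigned AlignBytes,
                               const Features &ST, bool *IsFast) {
  // If ds_read/write alignment requirements are not disabled, reject
  // anything below 4-byte alignment up front.
  if (!ST.UnalignedDSAccessEnabled && AlignBytes < 4)
    return false;

  // Natural alignment of the access in bytes, rounded up to a power of two.
  unsigned Required = std::bit_ceil(SizeInBits / 8);
  if (ST.LDSMisalignedBug && SizeInBits > 32 && AlignBytes < Required)
    return false;

  switch (SizeInBits) {
  case 64:
    // ds_read2/write2_b32 with adjacent offsets only needs 4-byte alignment.
    Required = 4;
    break;
  case 96:
    if (!ST.DS96AndDS128)
      return false;
    Required = 16; // ds_read/write_b96 needs 16 bytes on gfx8 and older.
    break;
  case 128:
    if (!ST.DS96AndDS128 || !ST.UseDS128)
      return false;
    Required = 8; // 8-byte aligned case handled via ds_read2/write2_b64.
    break;
  default:
    if (SizeInBits > 32)
      return false;
    break;
  }

  bool Allowed = AlignBytes >= Required || ST.UnalignedDSAccessEnabled;
  if (IsFast)
    *IsFast = Allowed; // same expression; see the FIXME about lying re. Fast
  return Allowed;
}

int main() {
  Features GFX9;
  GFX9.DS96AndDS128 = true;
  GFX9.UseDS128 = true;
  bool Fast = false;
  // A 128-bit LDS access with 8-byte alignment is allowed (ds_write2_b64).
  std::cout << allowsMisalignedLDSAccess(128, 8, GFX9, &Fast)
            << " fast=" << Fast << "\n";
}

The key point of the refactor, as the sketch shows, is that LowerLOAD/LowerSTORE no longer duplicate the b96/b128 and SI bounds-checking-bug special cases; they simply query this one function and split or expand based on its answer.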
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
llvm/test/CodeGen/AMDGPU/store-local.128.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e37448552a41c..8b25b16abe923 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1516,25 +1516,23 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
AddrSpace == AMDGPUAS::REGION_ADDRESS) {
+ // Check if alignment requirements for ds_read/write instructions are
+ // disabled.
+ if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
+ return false;
+
Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
Alignment < RequiredAlignment)
return false;
- // Check if alignment requirements for ds_read/write instructions are
- // disabled.
- if (Subtarget->hasUnalignedDSAccessEnabled()) {
- if (IsFast)
- *IsFast = Alignment != Align(2);
- return true;
- }
-
// Either, the alignment requirements are "enabled", or there is an
// unaligned LDS access related hardware bug though alignment requirements
// are "disabled". In either case, we need to check for proper alignment
// requirements.
//
- if (Size == 64) {
+ switch (Size) {
+ case 64:
// SI has a hardware bug in the LDS / GDS bounds checking: if the base
// address is negative, then the instruction is incorrectly treated as
// out-of-bounds even if base + offsets is in bounds. Split vectorized
@@ -1546,31 +1544,41 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
// 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
// can do a 4 byte aligned, 8 byte access in a single operation using
// ds_read2/write2_b32 with adjacent offsets.
- bool AlignedBy4 = Alignment >= Align(4);
- if (IsFast)
- *IsFast = AlignedBy4;
+ RequiredAlignment = Align(4);
+ break;
+ case 96:
+ if (!Subtarget->hasDS96AndDS128())
+ return false;
- return AlignedBy4;
- }
- if (Size == 96) {
// 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
// gfx8 and older.
- bool AlignedBy16 = Alignment >= Align(16);
- if (IsFast)
- *IsFast = AlignedBy16;
+ RequiredAlignment = Align(16);
+ break;
+ case 128:
+ if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
+ return false;
- return AlignedBy16;
- }
- if (Size == 128) {
// 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
// gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
// single operation using ds_read2/write2_b64.
- bool AlignedBy8 = Alignment >= Align(8);
- if (IsFast)
- *IsFast = AlignedBy8;
+ RequiredAlignment = Align(8);
+ break;
+ default:
+ if (Size > 32)
+ return false;
- return AlignedBy8;
+ break;
}
+
+ if (IsFast) {
+ // FIXME: Lie it is fast if +unaligned-access-mode is passed so that
+ // DS accesses get vectorized.
+ *IsFast = Alignment >= RequiredAlignment ||
+ Subtarget->hasUnalignedDSAccessEnabled();
+ }
+
+ return Alignment >= RequiredAlignment ||
+ Subtarget->hasUnalignedDSAccessEnabled();
}
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
@@ -1595,9 +1603,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
return AlignedBy4;
}
- if (Subtarget->hasUnalignedBufferAccessEnabled() &&
- !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
+ if (Subtarget->hasUnalignedBufferAccessEnabled()) {
// If we have a uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
@@ -8519,27 +8525,15 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
- // Use ds_read_b128 or ds_read_b96 when possible.
- if (Subtarget->hasDS96AndDS128() &&
- ((Subtarget->useDS128() && MemVT.getStoreSize() == 16) ||
- MemVT.getStoreSize() == 12) &&
- allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
- Load->getAlign()))
+ bool Fast = false;
+ auto Flags = Load->getMemOperand()->getFlags();
+ if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
+ Load->getAlign(), Flags, &Fast) &&
+ Fast)
return SDValue();
- if (NumElements > 2)
+ if (MemVT.isVector())
return SplitVectorLoad(Op, DAG);
-
- // SI has a hardware bug in the LDS / GDS bounds checking: if the base
- // address is negative, then the instruction is incorrectly treated as
- // out-of-bounds even if base + offsets is in bounds. Split vectorized
- // loads here to avoid emitting ds_read2_b32. We may re-combine the
- // load later in the SILoadStoreOptimizer.
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
- NumElements == 2 && MemVT.getStoreSize() == 8 &&
- Load->getAlignment() < 8) {
- return SplitVectorLoad(Op, DAG);
- }
}
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
@@ -9030,36 +9024,17 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
- // Use ds_write_b128 or ds_write_b96 when possible.
- if (Subtarget->hasDS96AndDS128() &&
- ((Subtarget->useDS128() && VT.getStoreSize() == 16) ||
- (VT.getStoreSize() == 12)) &&
- allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
- Store->getAlign()))
+ bool Fast = false;
+ auto Flags = Store->getMemOperand()->getFlags();
+ if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
+ Store->getAlign(), Flags, &Fast) &&
+ Fast)
return SDValue();
- if (NumElements > 2)
+ if (VT.isVector())
return SplitVectorStore(Op, DAG);
- // SI has a hardware bug in the LDS / GDS bounds checking: if the base
- // address is negative, then the instruction is incorrectly treated as
- // out-of-bounds even if base + offsets is in bounds. Split vectorized
- // stores here to avoid emitting ds_write2_b32. We may re-combine the
- // store later in the SILoadStoreOptimizer.
- if (!Subtarget->hasUsableDSOffset() &&
- NumElements == 2 && VT.getStoreSize() == 8 &&
- Store->getAlignment() < 8) {
- return SplitVectorStore(Op, DAG);
- }
-
- if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- VT, *Store->getMemOperand())) {
- if (VT.isVector())
- return SplitVectorStore(Op, DAG);
- return expandUnalignedStore(Store, DAG);
- }
-
- return SDValue();
+ return expandUnalignedStore(Store, DAG);
} else {
llvm_unreachable("unhandled address space");
}
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
index ee2f633f57dba..8ca037fd49c84 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
@@ -66,41 +66,40 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg
; CHECK-NEXT: s_mov_b32 s6, s4
; CHECK-NEXT: s_mov_b32 s5, s3
; CHECK-NEXT: s_mov_b32 s4, s2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, 4, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, 20, v1
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 16, v1
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, 12, v1
; CHECK-NEXT: v_add_i32_e32 v5, vcc, 8, v1
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, 12, v1
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v1
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v1
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, 4, v1
; CHECK-NEXT: v_mov_b32_e32 v9, s0
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v2
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, 8, v2
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, 12, v2
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, 20, v2
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, 16, v2
; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: ds_read_b32 v3, v1
-; CHECK-NEXT: ds_read_b32 v4, v0
+; CHECK-NEXT: ds_read_b32 v7, v3
+; CHECK-NEXT: ds_read_b32 v6, v4
; CHECK-NEXT: ds_read_b32 v5, v5
-; CHECK-NEXT: ds_read_b32 v6, v6
-; CHECK-NEXT: ds_read_b32 v0, v7
-; CHECK-NEXT: ds_read_b32 v1, v8
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v2
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v2
-; CHECK-NEXT: s_waitcnt lgkmcnt(2)
-; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
+; CHECK-NEXT: ds_read_b32 v4, v8
+; CHECK-NEXT: ds_read_b32 v8, v0
+; CHECK-NEXT: ds_read_b32 v3, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, 12, v2
+; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2
+; CHECK-NEXT: v_add_i32_e32 v13, vcc, 4, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
-; CHECK-NEXT: s_waitcnt expcnt(0)
-; CHECK-NEXT: ds_read_b32 v0, v2
+; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
+; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
+; CHECK-NEXT: ds_read_b32 v0, v11
+; CHECK-NEXT: s_waitcnt expcnt(1)
+; CHECK-NEXT: ds_read_b32 v5, v1
+; CHECK-NEXT: ds_read_b32 v4, v12
+; CHECK-NEXT: ds_read_b32 v3, v13
+; CHECK-NEXT: ds_read_b32 v2, v2
; CHECK-NEXT: ds_read_b32 v1, v10
-; CHECK-NEXT: ds_read_b32 v2, v11
-; CHECK-NEXT: ds_read_b32 v3, v12
-; CHECK-NEXT: ds_read_b32 v4, v7
-; CHECK-NEXT: ds_read_b32 v5, v8
; CHECK-NEXT: s_waitcnt lgkmcnt(5)
; CHECK-NEXT: exp mrt0 off, off, off, off
-; CHECK-NEXT: s_waitcnt lgkmcnt(2)
-; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: tbuffer_store_format_xy v[4:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
+; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc
; CHECK-NEXT: s_endpgm
%load1 = load <6 x float>, <6 x float> addrspace(3)* %arg5, align 4
%vec11 = shufflevector <6 x float> %load1, <6 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
index 50905ea44eb65..bf2fae1b773b7 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
@@ -462,12 +462,12 @@ define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out,
; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, s6
-; GFX6-NEXT: v_mov_b32_e32 v1, s7
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: v_mov_b32_e32 v4, s0
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
+; GFX6-NEXT: v_mov_b32_e32 v3, s7
+; GFX6-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX6-NEXT: s_endpgm
;
; GFX10-LABEL: store_lds_v4i32_align8: