[llvm] ac64995 - [AMDGPU] Only use ds_read/write_b128 for alignment >= 16
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 7 19:42:54 PDT 2021
Author: hsmahesha
Date: 2021-04-08T08:12:05+05:30
New Revision: ac64995ceb4016fe89305f7a24a79c94bdfd249d
URL: https://github.com/llvm/llvm-project/commit/ac64995ceb4016fe89305f7a24a79c94bdfd249d
DIFF: https://github.com/llvm/llvm-project/commit/ac64995ceb4016fe89305f7a24a79c94bdfd249d.diff
LOG: [AMDGPU] Only use ds_read/write_b128 for alignment >= 16
PS: Submitting on behalf of Jay.
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D100008
Added:
Modified:
llvm/lib/Target/AMDGPU/DSInstructions.td
llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
llvm/test/CodeGen/AMDGPU/ds-alignment.ll
llvm/test/CodeGen/AMDGPU/ds_read2.ll
llvm/test/CodeGen/AMDGPU/ds_write2.ll
llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index d07ab15f664d..e32c36e9a18d 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -713,6 +713,8 @@ defm : DSReadPat_mc <DS_READ_B32, vt, "load_local">;
defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">;
+// Prefer ds_read over ds_read2, all other things being equal, because it has
+// a larger immediate offset range.
let AddedComplexity = 100 in {
foreach vt = VReg_64.RegTypes in {
@@ -725,6 +727,9 @@ foreach vt = VReg_96.RegTypes in {
defm : DSReadPat_mc <DS_READ_B96, vt, "load_align16_local">;
}
+// For performance reasons restrict this to alignment >= 16 even with
+// unaligned-access-mode. At lower alignments ds_read2_b64 is always a better
+// choice.
foreach vt = VReg_128.RegTypes in {
defm : DSReadPat_mc <DS_READ_B128, vt, "load_align16_local">;
}
@@ -735,10 +740,6 @@ foreach vt = VReg_96.RegTypes in {
defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
}
-foreach vt = VReg_128.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B128, vt, "load_local">;
-}
-
} // End SubtargetPredicate = HasUnalignedAccessMode
} // End SubtargetPredicate = isGFX7Plus
@@ -868,6 +869,8 @@ foreach vt = VReg_128.RegTypes in {
defm : DS128Bit8ByteAlignedPat_mc<vt>;
}
+// Prefer ds_write over ds_write2, all other things being equal, because it has
+// a larger immediate offset range.
let AddedComplexity = 100 in {
foreach vt = VReg_64.RegTypes in {
@@ -880,6 +883,9 @@ foreach vt = VReg_96.RegTypes in {
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_align16_local">;
}
+// For performance reasons restrict this to alignment >= 16 even with
+// unaligned-access-mode. At lower alignments ds_write2_b64 is always a better
+// choice.
foreach vt = VReg_128.RegTypes in {
defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
}
@@ -890,10 +896,6 @@ foreach vt = VReg_96.RegTypes in {
defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;
}
-foreach vt = VReg_128.RegTypes in {
-defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_local">;
-}
-
} // End SubtargetPredicate = HasUnalignedAccessMode
} // End SubtargetPredicate = isGFX7Plus
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
index bb0c283202e0..efb8ff4ec382 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
@@ -26,8 +26,8 @@ bb:
; ALIGNED-DAG: ds_read2_b32
; ALIGNED-DAG: ds_write2_b32
; ALIGNED-DAG: ds_write2_b32
-; UNALIGNED-DAG: ds_read_b128
-; UNALIGNED-DAG: ds_write_b128
+; UNALIGNED-DAG: ds_read2_b64
+; UNALIGNED-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -108,8 +108,8 @@ bb:
; GCN-LABEL: test_local_v4_aligned8:
; ALIGNED-DAG: ds_read2_b64
; ALIGNED-DAG: ds_write2_b64
-; UNALIGNED-DAG: ds_read_b128
-; UNALIGNED-DAG: ds_write_b128
+; UNALIGNED-DAG: ds_read2_b64
+; UNALIGNED-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index 1c30dc547bd6..73e1da080f19 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -11,7 +11,7 @@ define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_read_b128 v[0:3], v0
+; GFX9-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -293,7 +293,7 @@ define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x)
; GFX9-LABEL: store_lds_v4i32_align1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: ds_write_b128 v0, v[1:4]
+; GFX9-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index ac4be6901938..e3b00b3da878 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -698,10 +698,10 @@ define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> add
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
+; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
+; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT: s_endpgm
%val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1
store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1
@@ -772,10 +772,10 @@ define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> add
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
+; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
+; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT: s_endpgm
%val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2
store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2
@@ -815,10 +815,10 @@ define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> add
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
+; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
+; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT: s_endpgm
%val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4
@@ -826,27 +826,16 @@ define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> add
}
define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
-; ALIGNED-LABEL: ds16align8:
-; ALIGNED: ; %bb.0:
-; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; ALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
-; ALIGNED-NEXT: v_mov_b32_e32 v4, s1
-; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
-; ALIGNED-NEXT: s_endpgm
-;
-; UNALIGNED-LABEL: ds16align8:
-; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
-; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
-; UNALIGNED-NEXT: s_endpgm
+; GCN-LABEL: ds16align8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; GCN-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GCN-NEXT: s_endpgm
%val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8
store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 8
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index db34168797a2..c82d0421d3d4 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1035,7 +1035,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)*
; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v4
+; GFX9-UNALIGNED-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 7ed0aca2c678..674837a33d3c 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -837,11 +837,16 @@ define amdgpu_kernel void @store_misaligned64_constant_offsets() {
;
; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets:
; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-UNALIGNED-NEXT: ds_write_b128 v1, v[0:3]
+; GFX9-UNALIGNED-NEXT: s_movk_i32 s0, 0x7b
+; GFX9-UNALIGNED-NEXT: s_mov_b32 s1, 0
+; GFX9-UNALIGNED-NEXT: s_mov_b32 s2, s0
+; GFX9-UNALIGNED-NEXT: s_mov_b32 s3, s1
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
@@ -1000,10 +1005,10 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrs
; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
+; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-UNALIGNED-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
index 6aba2b5bf2b7..b69765750ee7 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
@@ -26,8 +26,8 @@ bb:
; ALIGNED-DAG: ds_read2_b32
; ALIGNED-DAG: ds_write2_b32
; ALIGNED-DAG: ds_write2_b32
-; UNALIGNED-DAG: ds_read_b128
-; UNALIGNED-DAG: ds_write_b128
+; UNALIGNED-DAG: ds_read2_b64
+; UNALIGNED-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -223,8 +223,8 @@ bb:
; GCN-LABEL: test_local_v4_aligned8:
; ALIGNED-DAG: ds_read2_b64
; ALIGNED-DAG: ds_write2_b64
-; UNALIGNED-DAG: ds_read_b128
-; UNALIGNED-DAG: ds_write_b128
+; UNALIGNED-DAG: ds_read2_b64
+; UNALIGNED-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
More information about the llvm-commits mailing list