[llvm] 9af4a85 - AMDGPU: Add test which shows unnecessary register alignment (#158168)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 12 18:11:03 PDT 2025
Author: Matt Arsenault
Date: 2025-09-13T10:10:59+09:00
New Revision: 9af4a854602804430dc04766ce1be311259707d6
URL: https://github.com/llvm/llvm-project/commit/9af4a854602804430dc04766ce1be311259707d6
DIFF: https://github.com/llvm/llvm-project/commit/9af4a854602804430dc04766ce1be311259707d6.diff
LOG: AMDGPU: Add test which shows unnecessary register alignment (#158168)
The b96 tr loads are a special case that does not require even-aligned
VGPRs.
Added:
Modified:
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
index f504f2caa8632..3e96dfe40f745 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
@@ -158,3 +158,69 @@ entry:
store <4 x bfloat> %val, ptr addrspace(1) %use
ret void
}
+
+; This is a special case that does not require even-aligned VGPRs. Make
+; sure no copies are required for the unaligned ABI return value.
+define { i32, <3 x i32> } @ds_read_b96_tr_b6_no_align2_requirement(ptr addrspace(3) %ptr) {
+; GFX950-SDAG-LABEL: ds_read_b96_tr_b6_no_align2_requirement:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[2:4], v0 offset:32
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v4
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: ds_read_b96_tr_b6_no_align2_requirement:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[2:4], v0 offset:32
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v4
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 ; i64 index 4 = byte offset 32 (offset:32 in the checks)
+ %val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep) ; the checks currently place this result in v[2:4]
+ %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0 ; field 0 is the constant 0 (v0 in the checks)
+ %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1 ; field 1 reaches v1-v3 through the v_mov copies shown in the checks
+ ret { i32, <3 x i32> } %insert1
+}
+
+define void @ds_read_b96_tr_b6_no_align2_requirement_agpr(ptr addrspace(3) %ptr) {
+; GFX950-SDAG-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, v1
+; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX950-SDAG-NEXT: ;;#ASMSTART
+; GFX950-SDAG-NEXT: ; use a1 a2 a3
+; GFX950-SDAG-NEXT: ;;#ASMEND
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, v1
+; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX950-GISEL-NEXT: ;;#ASMSTART
+; GFX950-GISEL-NEXT: ; use a1 a2 a3
+; GFX950-GISEL-NEXT: ;;#ASMEND
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 ; i64 index 4 = byte offset 32 (offset:32 in the checks)
+ %val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep) ; the checks currently place this result in v[0:2]
+ %val0 = extractelement <3 x i32> %val, i32 0 ; split the vector so each element can be bound to its own asm operand
+ %val1 = extractelement <3 x i32> %val, i32 1
+ %val2 = extractelement <3 x i32> %val, i32 2
+ call void asm sideeffect "; use $0 $1 $2", "{a1},{a2},{a3}"(i32 %val0, i32 %val1, i32 %val2) ; constraints pin the elements to a1-a3; the checks reach them via v_accvgpr_write_b32
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
index d91b03ca4461d..d9f2fc55709a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
@@ -320,3 +320,57 @@ entry:
store <8 x bfloat> %val, ptr addrspace(1) %use
ret void
}
+
+; This is a special case that does not require even-aligned VGPRs. Make
+; sure no copies are required for the unaligned ABI return value.
+define { i32, <3 x i32> } @global_load_tr6_b96_vaddr_no_align2_requirement(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr6_b96_vaddr_no_align2_requirement:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v[0:1], off offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
+; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 ; i64 index 4 = byte offset 32 (offset:32 in the checks); %use is never referenced
+ %val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep) ; the checks currently place this result in v[2:4]
+ %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0 ; field 0 is the constant 0 (v0 in the checks)
+ %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1 ; field 1 reaches v1-v3 through the v_dual_mov copies shown in the checks
+ ret { i32, <3 x i32> } %insert1
+}
+
+define { i32, <3 x i32> } @global_load_tr6_b96_saddr_no_align2_requirement(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr6_b96_saddr_no_align2_requirement:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v0, s[0:1] offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
+; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 ; %addr is inreg: the checks address through s[0:1] with offset:32; %use is never referenced
+ %val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep) ; the checks currently place this result in v[2:4]
+ %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0 ; field 0 is the constant 0 (v0 in the checks)
+ %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1 ; field 1 reaches v1-v3 through the v_dual_mov copies shown in the checks
+ ret { i32, <3 x i32> } %insert1
+}
+
+define { i32, <3 x i32> } @ds_load_tr6_b96_no_align2_requirement(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: ds_load_tr6_b96_no_align2_requirement:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: ds_load_tr6_b96 v[2:4], v0 offset:32
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v2
+; GFX1250-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 ; i64 index 4 = byte offset 32 (offset:32 in the checks); %use is never referenced
+ %val = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3) %gep) ; the checks currently place this result in v[2:4]
+ %insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0 ; field 0 is the constant 0 (v0 in the checks)
+ %insert1 = insertvalue { i32, <3 x i32> } %insert0, <3 x i32> %val, 1 ; field 1 reaches v1-v3 through the v_dual_mov copies shown in the checks
+ ret { i32, <3 x i32> } %insert1
+}
More information about the llvm-commits
mailing list