[llvm] d2e74fa - AMDGPU: Set more mov flags on V_ACCVGPR_{READ|WRITE}_B32
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 1 15:59:31 PDT 2020
Author: Matt Arsenault
Date: 2020-07-01T18:58:59-04:00
New Revision: d2e74fad20bf8cf66ff20a43fe2934d71e046528
URL: https://github.com/llvm/llvm-project/commit/d2e74fad20bf8cf66ff20a43fe2934d71e046528
DIFF: https://github.com/llvm/llvm-project/commit/d2e74fad20bf8cf66ff20a43fe2934d71e046528.diff
LOG: AMDGPU: Set more mov flags on V_ACCVGPR_{READ|WRITE}_B32
This fixes extra copies when materializing constants in AGPRs. As a side
effect, it became a lot harder to trigger the spilling in spill-agpr.ll.
Added:
llvm/test/CodeGen/AMDGPU/agpr-remat.ll
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/VOP3PInstructions.td
llvm/test/CodeGen/AMDGPU/spill-agpr.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d7cd624484fc..9af8ffedce0f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -144,6 +144,8 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::V_ACCVGPR_READ_B32:
+ case AMDGPU::V_ACCVGPR_WRITE_B32:
// No implicit operands.
return MI.getNumOperands() == MI.getDesc().getNumOperands();
default:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 47238c692341..fc457ad212d4 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -366,10 +366,13 @@ def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, A
def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
let Predicates = [HasMAIInsts] in {
+
+let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
def V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
def V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite> {
let isMoveImm = 1;
}
+}
// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
new file mode 100644
index 000000000000..2ec1cee76662
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
+
+; Make sure there are no v_accvgpr_read_b32 copying back and forth
+; between AGPR and VGPR.
+define amdgpu_kernel void @remat_constant_voids_spill(i32 addrspace(1)* %p) #1 {
+; GFX908-LABEL: remat_constant_voids_spill:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: v_accvgpr_write_b32 a1, 1
+; GFX908-NEXT: v_accvgpr_write_b32 a5, 6
+; GFX908-NEXT: v_accvgpr_write_b32 a6, 7
+; GFX908-NEXT: v_accvgpr_write_b32 a7, 8
+; GFX908-NEXT: v_accvgpr_write_b32 a0, 9
+; GFX908-NEXT: v_accvgpr_write_b32 a2, 2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, 3
+; GFX908-NEXT: v_accvgpr_write_b32 a4, 4
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_write_b32 a1, 5
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_endpgm
+ call void asm sideeffect "", "a,a,a,a"(i32 1, i32 2, i32 3, i32 4)
+ call void asm sideeffect "", "a,a,a,a,a"(i32 5, i32 6, i32 7, i32 8, i32 9)
+ ret void
+}
+
+define void @remat_regcopy_avoids_spill(i32 %v0, i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10) #1 {
+; GFX908-LABEL: remat_regcopy_avoids_spill:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v3
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v7
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+ call void asm sideeffect "", "a,a,a,a"(i32 %v0, i32 %v1, i32 %v2, i32 %v3)
+ call void asm sideeffect "", "a,a,a,a,a"(i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8)
+ ret void
+}
+
+attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 3e60879de179..3e7b381a45fe 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -64,15 +64,26 @@ st:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
-; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0
-; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
-; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
-; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
-; A2V: ScratchSize: 0
-define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #1 {
- %tid = call i32 @llvm.amdgcn.workitem.id.x()
- call void asm sideeffect "", "a,a,a,a"(i32 1, i32 2, i32 3, i32 4)
- call void asm sideeffect "", "a,a,a,a,a"(i32 5, i32 6, i32 7, i32 8, i32 9)
+
+; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
+; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
+; A2V: ScratchSize: 0
+
+; A2M: buffer_store_dword v[[VSPILLSTORE:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; A2M: buffer_load_dword v[[VSPILL_RELOAD:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
+; A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]]
+define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
+ %v0 = load volatile i32, i32 addrspace(3)* undef
+ %v1 = load volatile i32, i32 addrspace(3)* undef
+ %v2 = load volatile i32, i32 addrspace(3)* undef
+ %v3 = load volatile i32, i32 addrspace(3)* undef
+ %v4 = load volatile i32, i32 addrspace(3)* undef
+ %v5 = load volatile i32, i32 addrspace(3)* undef
+ %v6 = load volatile i32, i32 addrspace(3)* undef
+ %v7 = load volatile i32, i32 addrspace(3)* undef
+ call void asm sideeffect "", "a,a,a,a,~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6}"(i32 %v0, i32 %v1, i32 %v2, i32 %v3)
+ %v8 = load volatile i32, i32 addrspace(3)* undef
+ call void asm sideeffect "", "a,a,a,a,a"(i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8)
ret void
}
More information about the llvm-commits mailing list