[llvm] r289261 - AMDGPU/SI: Don't reserve FLAT_SCR on non-HSA targets & without stack objects
Marek Olsak via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 9 11:49:49 PST 2016
Author: mareko
Date: Fri Dec 9 13:49:48 2016
New Revision: 289261
URL: http://llvm.org/viewvc/llvm-project?rev=289261&view=rev
Log:
AMDGPU/SI: Don't reserve FLAT_SCR on non-HSA targets & without stack objects
Summary: This frees 2 scalar registers.
Reviewers: tstellarAMD
Subscribers: qcolombet, arsenm, kzhuravl, wdng, nhaehnle, yaxunl, tony-tye
Differential Revision: https://reviews.llvm.org/D27150
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
llvm/trunk/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
llvm/trunk/test/CodeGen/AMDGPU/flat-scratch-reg.ll
llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp?rev=289261&r1=289260&r2=289261&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp Fri Dec 9 13:49:48 2016
@@ -391,7 +391,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(
case AMDGPU::FLAT_SCR:
case AMDGPU::FLAT_SCR_LO:
case AMDGPU::FLAT_SCR_HI:
- FlatUsed = true;
+ // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
+ // instructions aren't used to access the scratch buffer.
+ if (MFI->hasFlatScratchInit())
+ FlatUsed = true;
continue;
case AMDGPU::TBA:
Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp?rev=289261&r1=289260&r2=289261&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp Fri Dec 9 13:49:48 2016
@@ -1178,11 +1178,19 @@ unsigned SIRegisterInfo::getNumAddressab
return 104;
}
-unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST) const {
+unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST,
+ const SIMachineFunctionInfo &MFI) const {
+ if (MFI.hasFlatScratchInit()) {
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return 6; // FLAT_SCRATCH, XNACK, VCC (in that order)
+
+ if (ST.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
+ return 4; // FLAT_SCRATCH, VCC (in that order)
+ }
+
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- return 6; // VCC, FLAT_SCRATCH, XNACK.
- if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
- return 4; // VCC, FLAT_SCRATCH.
+ return 4; // XNACK, VCC (in that order)
+
return 2; // VCC.
}
@@ -1254,7 +1262,7 @@ unsigned SIRegisterInfo::getMaxNumSGPRs(
F, "amdgpu-num-sgpr", MaxNumSGPRs);
// Make sure requested value does not violate subtarget's specifications.
- if (Requested && (Requested <= getNumReservedSGPRs(ST)))
+ if (Requested && (Requested <= getNumReservedSGPRs(ST, MFI)))
Requested = 0;
// If more SGPRs are required to support the input user/system SGPRs,
@@ -1283,7 +1291,8 @@ unsigned SIRegisterInfo::getMaxNumSGPRs(
if (ST.hasSGPRInitBug())
MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
- return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST), MaxNumAddressableSGPRs);
+ return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST, MFI),
+ MaxNumAddressableSGPRs);
}
unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs(
Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h?rev=289261&r1=289260&r2=289261&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h Fri Dec 9 13:49:48 2016
@@ -22,6 +22,7 @@ namespace llvm {
class SISubtarget;
class MachineRegisterInfo;
+class SIMachineFunctionInfo;
class SIRegisterInfo final : public AMDGPURegisterInfo {
private:
@@ -198,7 +199,8 @@ public:
unsigned getNumAddressableSGPRs(const SISubtarget &ST) const;
/// \returns Number of reserved SGPRs supported by the subtarget.
- unsigned getNumReservedSGPRs(const SISubtarget &ST) const;
+ unsigned getNumReservedSGPRs(const SISubtarget &ST,
+ const SIMachineFunctionInfo &MFI) const;
/// \returns Minimum number of SGPRs that meets given number of waves per
/// execution unit requirement for given subtarget.
Modified: llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll?rev=289261&r1=289260&r2=289261&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll Fri Dec 9 13:49:48 2016
@@ -34,9 +34,9 @@ entry:
attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
; CHECK-LABEL: {{^}}min_1024_max_2048
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 1
; CHECK: VGPRBlocks: 7
-; CHECK: NumSGPRsForWavesPerEU: 19
+; CHECK: NumSGPRsForWavesPerEU: 13
; CHECK: NumVGPRsForWavesPerEU: 32
@var = addrspace(1) global float 0.0
define void @min_1024_max_2048() #3 {
Modified: llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll?rev=289261&r1=289260&r2=289261&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll Fri Dec 9 13:49:48 2016
@@ -4,7 +4,7 @@
; If spilling to smem, additional registers are used for the resource
; descriptor.
-; ALL-LABEL: {{^}}max_14_sgprs:
+; ALL-LABEL: {{^}}max_12_sgprs:
; FIXME: Should be able to skip this copying of the private segment
; buffer because all the SGPR spills are to VGPRs.
@@ -12,8 +12,8 @@
; ALL: s_mov_b64 s[6:7], s[2:3]
; ALL: s_mov_b64 s[4:5], s[0:1]
; ALL: SGPRBlocks: 1
-; ALL: NumSGPRsForWavesPerEU: 14
-define void @max_14_sgprs(i32 addrspace(1)* %out1,
+; ALL: NumSGPRsForWavesPerEU: 12
+define void @max_12_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3,
@@ -35,7 +35,7 @@ define void @max_14_sgprs(i32 addrspace(
; ---------------------
; total: 14
-; + reserved vcc, xnack, flat_scratch = 20
+; + reserved vcc = 16
; Because we can't handle re-using the last few input registers as the
; special vcc etc. registers (as well as decide to not use the unused
@@ -43,15 +43,15 @@ define void @max_14_sgprs(i32 addrspace(
; more than expected.
; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
-; TOSGPR: SGPRBlocks: 2
-; TOSGPR: NumSGPRsForWavesPerEU: 20
+; TOSGPR: SGPRBlocks: 1
+; TOSGPR: NumSGPRsForWavesPerEU: 16
; TOSMEM: s_mov_b64 s[6:7], s[2:3]
+; TOSMEM: s_mov_b32 s9, s13
; TOSMEM: s_mov_b64 s[4:5], s[0:1]
-; TOSMEM: s_mov_b32 s3, s13
-; TOSMEM: SGPRBlocks: 2
-; TOSMEM: NumSGPRsForWavesPerEU: 20
+; TOSMEM: SGPRBlocks: 1
+; TOSMEM: NumSGPRsForWavesPerEU: 16
define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3,
Modified: llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll?rev=289261&r1=289260&r2=289261&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll Fri Dec 9 13:49:48 2016
@@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="
; Exactly 10 waves per execution unit.
; CHECK-LABEL: {{^}}exactly_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 1
; CHECK: VGPRBlocks: 5
-; CHECK: NumSGPRsForWavesPerEU: 19
+; CHECK: NumSGPRsForWavesPerEU: 13
; CHECK: NumVGPRsForWavesPerEU: 24
define void @exactly_10() #9 {
%val0 = load volatile float, float addrspace(1)* @var
Modified: llvm/trunk/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/exceed-max-sgprs.ll?rev=289261&r1=289260&r2=289261&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/exceed-max-sgprs.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/exceed-max-sgprs.ll Fri Dec 9 13:49:48 2016
@@ -38,7 +38,7 @@ define void @use_too_many_sgprs_bonaire(
ret void
}
-; ERROR: error: scalar registers limit of 104 exceeded (108) in use_too_many_sgprs_bonaire_flat_scr
+; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr
define void @use_too_many_sgprs_bonaire_flat_scr() #1 {
call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
Modified: llvm/trunk/test/CodeGen/AMDGPU/flat-scratch-reg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/flat-scratch-reg.ll?rev=289261&r1=289260&r2=289261&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/flat-scratch-reg.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/flat-scratch-reg.ll Fri Dec 9 13:49:48 2016
@@ -1,18 +1,20 @@
-; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=NOXNACK -check-prefix=CI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=NOXNACK -check-prefix=VI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=XNACK -check-prefix=VI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefix=XNACK -check-prefix=VI -check-prefix=GCN %s
-
-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=NOXNACK -check-prefix=HSA-NOXNACK -check-prefix=HSA -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=NOXNACK -check-prefix=HSA-NOXNACK -check-prefix=HSA -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=XNACK -check-prefix=HSA-XNACK -check-prefix=HSA -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s
+
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-XNACK -check-prefix=GCN %s
; GCN-LABEL: {{^}}no_vcc_no_flat:
-; HSA-NOXNACK: is_xnack_enabled = 0
-; HSA-XNACK: is_xnack_enabled = 1
-
-; NOXNACK: ; NumSgprs: 8
-; XNACK: ; NumSgprs: 12
+; HSA-CI: is_xnack_enabled = 0
+; HSA-VI-NOXNACK: is_xnack_enabled = 0
+; HSA-VI-XNACK: is_xnack_enabled = 1
+
+; CI: ; NumSgprs: 8
+; VI-NOXNACK: ; NumSgprs: 8
+; VI-XNACK: ; NumSgprs: 12
define void @no_vcc_no_flat() {
entry:
call void asm sideeffect "", "~{SGPR7}"()
@@ -20,11 +22,13 @@ entry:
}
; GCN-LABEL: {{^}}vcc_no_flat:
-; HSA-NOXNACK: is_xnack_enabled = 0
-; HSA-XNACK: is_xnack_enabled = 1
-
-; NOXNACK: ; NumSgprs: 10
-; XNACK: ; NumSgprs: 12
+; HSA-CI: is_xnack_enabled = 0
+; HSA-VI-NOXNACK: is_xnack_enabled = 0
+; HSA-VI-XNACK: is_xnack_enabled = 1
+
+; CI: ; NumSgprs: 10
+; VI-NOXNACK: ; NumSgprs: 10
+; VI-XNACK: ; NumSgprs: 12
define void @vcc_no_flat() {
entry:
call void asm sideeffect "", "~{SGPR7},~{VCC}"()
@@ -32,11 +36,16 @@ entry:
}
; GCN-LABEL: {{^}}no_vcc_flat:
-; HSA-NOXNACK: is_xnack_enabled = 0
-; HSA-XNACK: is_xnack_enabled = 1
-
-; CI: ; NumSgprs: 12
-; VI: ; NumSgprs: 14
+; HSA-CI: is_xnack_enabled = 0
+; HSA-VI-NOXNACK: is_xnack_enabled = 0
+; HSA-VI-XNACK: is_xnack_enabled = 1
+
+; CI: ; NumSgprs: 8
+; VI-NOXNACK: ; NumSgprs: 8
+; VI-XNACK: ; NumSgprs: 12
+; HSA-CI: ; NumSgprs: 8
+; HSA-VI-NOXNACK: ; NumSgprs: 8
+; HSA-VI-XNACK: ; NumSgprs: 12
define void @no_vcc_flat() {
entry:
call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"()
@@ -47,8 +56,12 @@ entry:
; HSA-NOXNACK: is_xnack_enabled = 0
; HSA-XNACK: is_xnack_enabled = 1
-; CI: ; NumSgprs: 12
-; VI: ; NumSgprs: 14
+; CI: ; NumSgprs: 10
+; VI-NOXNACK: ; NumSgprs: 10
+; VI-XNACK: ; NumSgprs: 12
+; HSA-CI: ; NumSgprs: 10
+; HSA-VI-NOXNACK: ; NumSgprs: 10
+; HSA-VI-XNACK: ; NumSgprs: 12
define void @vcc_flat() {
entry:
call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"()
Modified: llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll?rev=289261&r1=289260&r2=289261&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll Fri Dec 9 13:49:48 2016
@@ -7,7 +7,8 @@
; XXX - Why does it like to use vcc?
; GCN-LABEL: {{^}}spill_m0:
-; TOSMEM: s_mov_b32 s84, SCRATCH_RSRC_DWORD0
+; TOSMEM: s_mov_b32 s[[LO:[0-9]+]], SCRATCH_RSRC_DWORD0
+; TOSMEM: s_mov_b32 s[[HI:[0-9]+]], 0xe80000
; GCN-DAG: s_cmp_lg_u32
@@ -22,7 +23,7 @@
; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
; TOSMEM: s_mov_b32 m0, s3{{$}}
; TOSMEM-NOT: [[M0_COPY]]
-; TOSMEM: s_buffer_store_dword [[M0_COPY]], s[84:87], m0 ; 4-byte Folded Spill
+; TOSMEM: s_buffer_store_dword [[M0_COPY]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill
; TOSMEM: s_waitcnt lgkmcnt(0)
; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
@@ -37,7 +38,7 @@
; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]]
; TOSMEM: s_mov_b32 m0, s3{{$}}
-; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s[84:87], m0 ; 4-byte Folded Reload
+; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Reload
; TOSMEM-NOT: [[M0_RESTORE]]
; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]]
@@ -161,10 +162,10 @@ endif:
; TOSMEM: s_cmp_eq_u32
; TOSMEM-NOT: m0
; TOSMEM: s_mov_b32 m0, s3
-; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[84:87], m0 ; 8-byte Folded Spill
+; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
; TOSMEM-NOT: m0
; TOSMEM: s_add_u32 m0, s3, 0x200
-; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[84:87], m0 ; 4-byte Folded Spill
+; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill
; TOSMEM-NOT: m0
; TOSMEM: s_cbranch_scc1
@@ -172,7 +173,7 @@ endif:
; TOSMEM: s_mov_b32 vcc_hi, m0
; TOSMEM: s_mov_b32 m0, s3
-; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[84:87], m0 ; 8-byte Folded Reload
+; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload
; TOSMEM: s_mov_b32 m0, vcc_hi
; TOSMEM: s_waitcnt lgkmcnt(0)
@@ -180,7 +181,7 @@ endif:
; TOSMEM-NOT: m0
; TOSMEM: s_add_u32 m0, s3, 0x200
-; TOSMEM: s_buffer_load_dword s0, s[84:87], m0 ; 4-byte Folded Reload
+; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload
; TOSMEM-NOT: m0
; TOSMEM: s_waitcnt lgkmcnt(0)
; TOSMEM-NOT: m0
More information about the llvm-commits
mailing list