[llvm] 6fb0259 - [AMDGPU] Add support for architected flat scratch
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Fri May 14 10:53:56 PDT 2021
Author: Stanislav Mekhanoshin
Date: 2021-05-14T10:53:48-07:00
New Revision: 6fb02596a2094df90e9277981aae7a06a9b6671b
URL: https://github.com/llvm/llvm-project/commit/6fb02596a2094df90e9277981aae7a06a9b6671b
DIFF: https://github.com/llvm/llvm-project/commit/6fb02596a2094df90e9277981aae7a06a9b6671b.diff
LOG: [AMDGPU] Add support for architected flat scratch
Add support for the read-only flat scratch register initialized
by the SPI.
Differential Revision: https://reviews.llvm.org/D102432
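As a usage sketch (mirroring the RUN lines and CHECK directives in the updated
flat-scratch-init.ll test below, not an exhaustive description of the output),
the feature is enabled with -mattr=+architected-flat-scratch. With it enabled,
no flat_scratch setup code is emitted, scratch accesses use the flat SCRATCH
instructions, and the kernel descriptor reports .amdhsa_enable_private_segment
instead of .amdhsa_system_sgpr_private_segment_wavefront_offset and the user
SGPR directives for the private segment buffer and flat scratch init:

    ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 \
    ;          -mattr=+architected-flat-scratch < %s
    define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
      %alloca = alloca i32, addrspace(5)
      store volatile i32 0, i32 addrspace(5)* %alloca
      ret void
    }
    ; With the feature enabled, the store is selected as scratch_store_dword,
    ; flat_scratch is never written, and the emitted kernel descriptor
    ; contains ".amdhsa_enable_private_segment 1" (see the test changes).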
Added:
Modified:
llvm/docs/AMDGPUUsage.rst
llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
Removed:
################################################################################
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 64766ff804935..892dfa8b6a07e 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -3657,14 +3657,22 @@ The fields used by CP for code objects before V3 also match those specified in
``compute_pgm_rsrc2.user_sgpr.user_sgpr_count``.
Any requests beyond 16
will be ignored.
- >448 1 bit ENABLE_SGPR_PRIVATE_SEGMENT
- _BUFFER
+ >448 1 bit ENABLE_SGPR_PRIVATE_SEGMENT If the *Target Properties*
+ _BUFFER column of
+ :ref:`amdgpu-processor-table`
+ specifies *Architected flat
+ scratch* then not supported
+ and must be 0,
>449 1 bit ENABLE_SGPR_DISPATCH_PTR
>450 1 bit ENABLE_SGPR_QUEUE_PTR
>451 1 bit ENABLE_SGPR_KERNARG_SEGMENT_PTR
>452 1 bit ENABLE_SGPR_DISPATCH_ID
- >453 1 bit ENABLE_SGPR_FLAT_SCRATCH_INIT
-
+ >453 1 bit ENABLE_SGPR_FLAT_SCRATCH_INIT If the *Target Properties*
+ column of
+ :ref:`amdgpu-processor-table`
+ specifies *Architected flat
+ scratch* then not supported
+ and must be 0,
>454 1 bit ENABLE_SGPR_PRIVATE_SEGMENT
_SIZE
457:455 3 bits Reserved, must be 0.
@@ -3984,14 +3992,27 @@ The fields used by CP for code objects before V3 also match those specified in
======= ======= =============================== ===========================================================================
Bits Size Field Name Description
======= ======= =============================== ===========================================================================
- 0 1 bit ENABLE_PRIVATE_SEGMENT Enable the setup of the
- private segment.
-
- In addition, enable the
- setup of the SGPR
- wavefront scratch offset
- system register (see
- :ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
+ 0 1 bit ENABLE_PRIVATE_SEGMENT * Enable the setup of the
+ private segment.
+ * If the *Target Properties*
+ column of
+ :ref:`amdgpu-processor-table`
+ does not specify
+ *Architected flat
+ scratch* then enable the
+ setup of the SGPR
+ wavefront scratch offset
+ system register (see
+ :ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
+ * If the *Target Properties*
+ column of
+ :ref:`amdgpu-processor-table`
+ specifies *Architected
+ flat scratch* then enable
+ the setup of the
+ FLAT_SCRATCH register
+ pair (see
+ :ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
Used by CP to set up
``COMPUTE_PGM_RSRC2.SCRATCH_EN``.
@@ -4550,12 +4571,26 @@ There are
different methods used for initializing flat scratch:
segment address when using the Scratch Segment Buffer (see
:ref:`amdgpu-amdhsa-kernel-prolog-private-segment-buffer`).
+* If the *Target Properties* column of :ref:`amdgpu-processor-table`
+ specifies *Architected flat scratch*:
+
+ If ENABLE_PRIVATE_SEGMENT is enabled in
+ :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx10-table` then the FLAT_SCRATCH
+ register pair will be initialized to the 64-bit address of the base of scratch
+ backing memory being managed by SPI for the queue executing the kernel
+ dispatch plus the value of the wave's Scratch Wavefront Offset for use as the
+ flat scratch base in flat memory instructions.
+
.. _amdgpu-amdhsa-kernel-prolog-private-segment-buffer:
Private Segment Buffer
++++++++++++++++++++++
-Private Segment Buffer SGPR register is used to initialize 4 SGPRs
+If the *Target Properties* column of :ref:`amdgpu-processor-table` specifies
+*Architected flat scratch* then a Private Segment Buffer is not supported.
+Instead the flat SCRATCH instructions are used.
+
+Otherwise, Private Segment Buffer SGPR register is used to initialize 4 SGPRs
that are used as a V# to access scratch. CP uses the value provided by the
runtime. It is used, together with Scratch Wavefront Offset as an offset, to
access the private memory space using a segment address. See
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 5cd89d4fa4d1c..04e93055a9ac6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -731,6 +731,12 @@ def FeaturePackedTID : SubtargetFeature<"packed-tid",
"Workitem IDs are packed into v0 at kernel launch"
>;
+def FeatureArchitectedFlatScratch : SubtargetFeature<"architected-flat-scratch",
+ "HasArchitectedFlatScratch",
+ "true",
+ "Flat Scratch register is a readonly SPI initialized architected register"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c30c1105aca22..d96b098d2a965 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -723,7 +723,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
const SIRegisterInfo &TRI = TII->getRegisterInfo();
Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
- MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
+ MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
+ MRI.isLiveIn(MFI->getPreloadedReg(
+ AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
// Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
// instructions aren't used to access the scratch buffer. Inline assembly may
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 0585e9e69a66f..0a1206c2d0fee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -289,6 +289,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FlatGlobalInsts(false),
FlatScratchInsts(false),
ScalarFlatScratchInsts(false),
+ HasArchitectedFlatScratch(false),
AddNoCarryInsts(false),
HasUnpackedD16VMem(false),
LDSMisalignedBug(false),
@@ -327,7 +328,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
}
bool GCNSubtarget::enableFlatScratch() const {
- return EnableFlatScratch && hasFlatScratchInsts();
+ return flatScratchIsArchitected() ||
+ (EnableFlatScratch && hasFlatScratchInsts());
}
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index d4a43370dcc2b..611733218412f 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1373,6 +1373,10 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
return getFeatureBits()[AMDGPU::FeatureFlatInstOffsets];
}
+ bool hasArchitectedFlatScratch() const {
+ return getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
+ }
+
bool hasSGPR102_SGPR103() const {
return !isVI() && !isGFX9();
}
@@ -4549,6 +4553,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
return OutOfRangeError(ValRange);
KD.kernarg_size = Val;
} else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") {
+ if (hasArchitectedFlatScratch())
+ return Error(IDRange.Start,
+ "directive is not supported with architected flat scratch",
+ IDRange);
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
Val, ValRange);
@@ -4579,6 +4587,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (Val)
UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
+ if (hasArchitectedFlatScratch())
+ return Error(IDRange.Start,
+ "directive is not supported with architected flat scratch",
+ IDRange);
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
ValRange);
@@ -4598,10 +4610,20 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
Val, ValRange);
} else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
- PARSE_BITS_ENTRY(
- KD.compute_pgm_rsrc2,
- COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val,
- ValRange);
+ if (hasArchitectedFlatScratch())
+ return Error(IDRange.Start,
+ "directive is not supported with architected flat scratch",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange);
+ } else if (ID == ".amdhsa_enable_private_segment") {
+ if (!hasArchitectedFlatScratch())
+ return Error(
+ IDRange.Start,
+ "directive is not supported without architected flat scratch",
+ IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
+ COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange);
} else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val,
@@ -4639,6 +4661,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
} else if (ID == ".amdhsa_reserve_flat_scratch") {
if (IVersion.Major < 7)
return Error(IDRange.Start, "directive requires gfx7+", IDRange);
+ if (hasArchitectedFlatScratch())
+ return Error(IDRange.Start,
+ "directive is not supported with architected flat scratch",
+ IDRange);
if (!isUInt<1>(Val))
return OutOfRangeError(ValRange);
ReserveFlatScr = Val;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 773a541234149..a8c38a3f274f9 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -1457,6 +1457,10 @@ bool AMDGPUDisassembler::isGFX10Plus() const {
return AMDGPU::isGFX10Plus(STI);
}
+bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
+ return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
+}
+
//===----------------------------------------------------------------------===//
// AMDGPU specific symbol handling
//===----------------------------------------------------------------------===//
@@ -1516,7 +1520,8 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
AMDGPU::IsaInfo::getSGPREncodingGranule(&STI);
KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n';
- KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
+ if (!hasArchitectedFlatScratch())
+ KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n';
KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n";
@@ -1567,9 +1572,12 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2(
uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
using namespace amdhsa;
StringRef Indent = "\t";
- PRINT_DIRECTIVE(
- ".amdhsa_system_sgpr_private_segment_wavefront_offset",
- COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
+ if (hasArchitectedFlatScratch())
+ PRINT_DIRECTIVE(".amdhsa_enable_private_segment",
+ COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
+ else
+ PRINT_DIRECTIVE(".amdhsa_system_sgpr_private_segment_wavefront_offset",
+ COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x",
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y",
@@ -1710,8 +1718,9 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
using namespace amdhsa;
TwoByteBuffer = DE.getU16(Cursor);
- PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
- KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
+ if (!hasArchitectedFlatScratch())
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr",
@@ -1720,8 +1729,9 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
- PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
- KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
+ if (!hasArchitectedFlatScratch())
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
+ KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 93e2f636bdabd..dc879ec5ad883 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -174,6 +174,8 @@ class AMDGPUDisassembler : public MCDisassembler {
bool isGFX9Plus() const;
bool isGFX10() const;
bool isGFX10Plus() const;
+
+ bool hasArchitectedFlatScratch() const;
};
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 31260e3794d75..b970a9ceae521 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -167,6 +167,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool FlatGlobalInsts;
bool FlatScratchInsts;
bool ScalarFlatScratchInsts;
+ bool HasArchitectedFlatScratch;
bool AddNoCarryInsts;
bool HasUnpackedD16VMem;
bool R600ALUInst;
@@ -985,6 +986,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
+ /// \returns true if the flat_scratch register is initialized by the HW.
+ /// In this case it is readonly.
+ bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
+
/// \returns true if the machine has merged shaders in which s0-s7 are
/// reserved by the hardware and user SGPRs start at s8
bool hasMergedShaders() const {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index ac12e26f0b8cd..446069a1c8b80 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -315,9 +315,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
<< KD.private_segment_fixed_size << '\n';
OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n';
- PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_buffer", KD,
- kernel_code_properties,
- amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
+ if (!hasArchitectedFlatScratch(STI))
+ PRINT_FIELD(
+ OS, ".amdhsa_user_sgpr_private_segment_buffer", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
@@ -330,9 +332,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_id", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
- PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD,
- kernel_code_properties,
- amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
+ if (!hasArchitectedFlatScratch(STI))
+ PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD,
+ kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
@@ -340,10 +343,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
- PRINT_FIELD(
- OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD,
- compute_pgm_rsrc2,
- amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
+ PRINT_FIELD(OS,
+ (hasArchitectedFlatScratch(STI)
+ ? ".amdhsa_enable_private_segment"
+ : ".amdhsa_system_sgpr_private_segment_wavefront_offset"),
+ KD, compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD,
compute_pgm_rsrc2,
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
@@ -372,7 +377,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
if (!ReserveVCC)
OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n';
- if (IVersion.Major >= 7 && !ReserveFlatScr)
+ if (IVersion.Major >= 7 && !ReserveFlatScr && !hasArchitectedFlatScratch(STI))
OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n';
if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) {
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 9ad53beefa3fd..963f58f4c72f1 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -493,7 +493,8 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
}
- if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
+ if ((MFI->hasFlatScratchInit() || ScratchRsrcReg) &&
+ !ST.flatScratchIsArchitected()) {
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 8579e8e73ee03..c9a79447147c7 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -124,13 +124,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (WorkItemIDZ)
WorkItemIDY = true;
- PrivateSegmentWaveByteOffset = true;
-
- // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
- if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
- (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
- ArgInfo.PrivateSegmentWaveByteOffset =
- ArgDescriptor::createRegister(AMDGPU::SGPR5);
+ if (!ST.flatScratchIsArchitected()) {
+ PrivateSegmentWaveByteOffset = true;
+
+ // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
+ (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
+ ArgInfo.PrivateSegmentWaveByteOffset =
+ ArgDescriptor::createRegister(AMDGPU::SGPR5);
+ }
}
bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
@@ -162,7 +164,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
KernargSegmentPtr = true;
if (ST.hasFlatAddressSpace() && isEntryFunction() &&
- (isAmdHsaOrMesa || ST.enableFlatScratch())) {
+ (isAmdHsaOrMesa || ST.enableFlatScratch()) &&
+ !ST.flatScratchIsArchitected()) {
// TODO: This could be refined a lot. The attribute is a poor way of
// detecting calls or stack objects that may require it before argument
// lowering.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 63b7834061df4..661684c68a026 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1459,6 +1459,10 @@ bool isGFX90A(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
}
+bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
+}
+
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 85c0b4f47992b..a03be8bf8156d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -740,6 +740,7 @@ bool isGCN3Encoding(const MCSubtargetInfo &STI);
bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
bool isGFX90A(const MCSubtargetInfo &STI);
+bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
index 455c19fcdfc23..81a94869eec6c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
@@ -1,16 +1,24 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,RW-FLAT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GCN,RO-FLAT %s
; Make sure flat_scratch_init is set
; GCN-LABEL: {{^}}stack_object_addrspacecast_in_kernel_no_calls:
-; GCN: s_add_u32 flat_scratch_lo, s4, s7
-; GCN: s_addc_u32 flat_scratch_hi, s5, 0
-; GCN: flat_store_dword
-; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset
-; GCN-NOT: .amdhsa_reserve_flat_scratch
-; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
-; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; RW-FLAT: s_add_u32 flat_scratch_lo, s4, s7
+; RW-FLAT: s_addc_u32 flat_scratch_hi, s5, 0
+; RO-FLAT-NOT: flat_scratch
+; GCN: flat_store_dword
+; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
+; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1
+; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
+; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset
+; RW-FLAT-NOT: .amdhsa_enable_private_segment
+; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
+; RO-FLAT: .amdhsa_enable_private_segment 1
+; GCN-NOT: .amdhsa_reserve_flat_scratch
+; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
+; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
%alloca = alloca i32, addrspace(5)
%cast = addrspacecast i32 addrspace(5)* %alloca to i32*
@@ -20,15 +28,23 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
; TODO: Could optimize out in this case
; GCN-LABEL: {{^}}stack_object_in_kernel_no_calls:
-; GCN: s_add_u32 flat_scratch_lo, s4, s7
-; GCN: s_addc_u32 flat_scratch_hi, s5, 0
-; GCN: buffer_store_dword
-; GCN: .amdhsa_user_sgpr_private_segment_buffer 1
-; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
-; GCN-NOT: .amdhsa_reserve_flat_scratch
-; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
-; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; RW-FLAT: s_add_u32 flat_scratch_lo, s4, s7
+; RW-FLAT: s_addc_u32 flat_scratch_hi, s5, 0
+; RO-FLAT-NOT: flat_scratch
+; RW-FLAT: buffer_store_dword
+; RO-FLAT: scratch_store_dword
+; RW-FLAT: .amdhsa_user_sgpr_private_segment_buffer 1
+; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
+; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1
+; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
+; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; RW-FLAT-NOT: .amdhsa_enable_private_segment
+; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
+; RO-FLAT: .amdhsa_enable_private_segment 1
+; GCN-NOT: .amdhsa_reserve_flat_scratch
+; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
+; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
@@ -36,13 +52,20 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
}
; GCN-LABEL: {{^}}kernel_no_calls_no_stack:
-; GCN-NOT: flat_scratch
-; GCN: .amdhsa_user_sgpr_private_segment_buffer 1
-; GCN: .amdhsa_user_sgpr_flat_scratch_init 0
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
-; GCN: .amdhsa_reserve_flat_scratch 0
-; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
-; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 4
+; GCN-NOT: flat_scratch
+; RW-FLAT: .amdhsa_user_sgpr_private_segment_buffer 1
+; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
+; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 0
+; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
+; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
+; RW-FLAT-NOT: .amdhsa_enable_private_segment
+; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
+; RO-FLAT: .amdhsa_enable_private_segment 0
+; RW-FLAT: .amdhsa_reserve_flat_scratch 0
+; RO-FLAT-NOT: .amdhsa_reserve_flat_scratch 0
+; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
+; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 4
+; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
define amdgpu_kernel void @kernel_no_calls_no_stack() {
ret void
}