[lld] 868abf0 - Revert "[AMDGPU] Remove Code Object V3 (#67118)"
via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 18 03:57:42 PDT 2023
Author: pvanhout
Date: 2023-10-18T12:55:36+02:00
New Revision: 868abf09619cb10ba710162483e5a66b0c1e4446
URL: https://github.com/llvm/llvm-project/commit/868abf09619cb10ba710162483e5a66b0c1e4446
DIFF: https://github.com/llvm/llvm-project/commit/868abf09619cb10ba710162483e5a66b0c1e4446.diff
LOG: Revert "[AMDGPU] Remove Code Object V3 (#67118)"
This reverts commit 544d91280c26fd5f7acd70eac4d667863562f4cc.
Added:
llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll
llvm/test/CodeGen/AMDGPU/directive-amdgcn-target-v3.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3-asan.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v3.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
llvm/test/MC/AMDGPU/hsa-diag-v3.s
llvm/test/MC/AMDGPU/hsa-gfx10-v3.s
llvm/test/MC/AMDGPU/hsa-gfx11-v3.s
llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s
llvm/test/MC/AMDGPU/hsa-gfx940-v3.s
llvm/test/MC/AMDGPU/hsa-v3.s
Modified:
clang/include/clang/Basic/TargetOptions.h
clang/include/clang/Driver/Options.td
clang/lib/Driver/ToolChains/CommonArgs.cpp
clang/test/CodeGenCUDA/amdgpu-code-object-version.cu
clang/test/Driver/hip-code-object-version.hip
clang/test/Driver/hip-device-libs.hip
lld/test/ELF/amdgpu-abi-version.s
llvm/docs/AMDGPUUsage.rst
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
llvm/test/CodeGen/AMDGPU/kernarg-size.ll
llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
llvm/test/CodeGen/AMDGPU/trap-abis.ll
llvm/test/MC/AMDGPU/user-sgpr-count-diag.s
llvm/test/MC/AMDGPU/user-sgpr-count.s
Removed:
llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v4.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-asan.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
llvm/test/MC/AMDGPU/hsa-diag-v4.s
################################################################################
diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h
index ba3acd029587160..8bb03249b7f8308 100644
--- a/clang/include/clang/Basic/TargetOptions.h
+++ b/clang/include/clang/Basic/TargetOptions.h
@@ -83,7 +83,7 @@ class TargetOptions {
enum CodeObjectVersionKind {
COV_None,
COV_2 = 200, // Unsupported.
- COV_3 = 300, // Unsupported.
+ COV_3 = 300,
COV_4 = 400,
COV_5 = 500,
};
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index a89d6b6579f1176..640044622fc09ee 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4682,9 +4682,9 @@ defm amdgpu_ieee : BoolOption<"m", "amdgpu-ieee",
def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group<m_Group>,
HelpText<"Specify code object ABI version. Defaults to 4. (AMDGPU only)">,
Visibility<[ClangOption, CC1Option]>,
- Values<"none,4,5">,
+ Values<"none,3,4,5">,
NormalizedValuesScope<"TargetOptions">,
- NormalizedValues<["COV_None", "COV_4", "COV_5"]>,
+ NormalizedValues<["COV_None", "COV_3", "COV_4", "COV_5"]>,
MarshallingInfoEnum<TargetOpts<"CodeObjectVersion">, "COV_4">;
defm cumode : SimpleMFlag<"cumode",
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index f104ec5a881cb96..25fd940584624ee 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -2338,7 +2338,7 @@ getAMDGPUCodeObjectArgument(const Driver &D, const llvm::opt::ArgList &Args) {
void tools::checkAMDGPUCodeObjectVersion(const Driver &D,
const llvm::opt::ArgList &Args) {
- const unsigned MinCodeObjVer = 4;
+ const unsigned MinCodeObjVer = 3;
const unsigned MaxCodeObjVer = 5;
if (auto *CodeObjArg = getAMDGPUCodeObjectArgument(D, Args)) {
diff --git a/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu b/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu
index ff5deaf9ab850d2..0ddd63faf46f28f 100644
--- a/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu
@@ -3,6 +3,9 @@
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
// RUN: -o - %s | FileCheck %s -check-prefix=V4
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN: -mcode-object-version=3 -o - %s | FileCheck -check-prefix=V3 %s
+
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
// RUN: -mcode-object-version=4 -o - %s | FileCheck -check-prefix=V4 %s
@@ -15,6 +18,7 @@
// RUN: not %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
// RUN: -mcode-object-version=4.1 -o - %s 2>&1| FileCheck %s -check-prefix=INV
+// V3: !{{.*}} = !{i32 1, !"amdgpu_code_object_version", i32 300}
// V4: !{{.*}} = !{i32 1, !"amdgpu_code_object_version", i32 400}
// V5: !{{.*}} = !{i32 1, !"amdgpu_code_object_version", i32 500}
// NONE-NOT: !{{.*}} = !{i32 1, !"amdgpu_code_object_version",
diff --git a/clang/test/Driver/hip-code-object-version.hip b/clang/test/Driver/hip-code-object-version.hip
index af5f9a3da21dfd3..33559b6576e7d30 100644
--- a/clang/test/Driver/hip-code-object-version.hip
+++ b/clang/test/Driver/hip-code-object-version.hip
@@ -1,5 +1,20 @@
// REQUIRES: amdgpu-registered-target
+// Check bundle ID for code object v3.
+
+// RUN: not %clang -### --target=x86_64-linux-gnu \
+// RUN: -mcode-object-version=3 \
+// RUN: --offload-arch=gfx906 --rocm-path=%S/Inputs/rocm \
+// RUN: %s 2>&1 | FileCheck -check-prefix=V3 %s
+
+// RUN: not %clang -### --target=x86_64-linux-gnu \
+// RUN: -mcode-object-version=4 -mcode-object-version=3 \
+// RUN: --offload-arch=gfx906 --rocm-path=%S/Inputs/rocm \
+// RUN: %s 2>&1 | FileCheck -check-prefix=V3 %s
+
+// V3: "-mcode-object-version=3"
+// V3: "-mllvm" "--amdhsa-code-object-version=3"
+// V3: "-targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx906"
// Check bundle ID for code object version 4.
@@ -47,13 +62,6 @@
// INVALID_2: error: invalid integral value '2' in '-mcode-object-version=2'
// INVALID_2-NOT: error: invalid integral value
-// RUN: not %clang -### --target=x86_64-linux-gnu \
-// RUN: -mcode-object-version=3 \
-// RUN: --offload-arch=gfx906 --rocm-path=%S/Inputs/rocm \
-// RUN: %s 2>&1 | FileCheck -check-prefix=INVALID_3 %s
-// INVALID_3: error: invalid integral value '3' in '-mcode-object-version=3'
-// INVALID_3-NOT: error: invalid integral value
-
// Check LLVM code object version option --amdhsa-code-object-version
// is passed to -cc1 and -cc1as, and -mcode-object-version is passed
// to -cc1 but not -cc1as.
diff --git a/clang/test/Driver/hip-device-libs.hip b/clang/test/Driver/hip-device-libs.hip
index 6ac5778721ba5b7..71d9554da696b42 100644
--- a/clang/test/Driver/hip-device-libs.hip
+++ b/clang/test/Driver/hip-device-libs.hip
@@ -168,6 +168,12 @@
// RUN: --rocm-path=%S/Inputs/rocm %S/Inputs/hip_multiple_inputs/b.hip \
// RUN: 2>&1 | FileCheck %s --check-prefixes=NOABI4
+// Test -mcode-object-version=3
+// RUN: %clang -### --target=x86_64-linux-gnu --offload-arch=gfx900 \
+// RUN: -mcode-object-version=3 \
+// RUN: --rocm-path=%S/Inputs/rocm %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ABI4
+
// Test -mcode-object-version=4
// RUN: %clang -### --target=x86_64-linux-gnu --offload-arch=gfx900 \
// RUN: -mcode-object-version=4 \
diff --git a/lld/test/ELF/amdgpu-abi-version.s b/lld/test/ELF/amdgpu-abi-version.s
index cda9f5aafa5ee2f..455a52aec921092 100644
--- a/lld/test/ELF/amdgpu-abi-version.s
+++ b/lld/test/ELF/amdgpu-abi-version.s
@@ -1,4 +1,11 @@
# REQUIRES: amdgpu
+# RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 -filetype=obj %s -o %t.o
+# RUN: ld.lld -shared %t.o -o %t.so
+# RUN: llvm-readobj --file-headers %t.so | FileCheck --check-prefix=COV3 %s
+
+# COV3: OS/ABI: AMDGPU_HSA (0x40)
+# COV3: ABIVersion: 1
+
# RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 -filetype=obj %s -o %t.o
# RUN: ld.lld -shared %t.o -o %t.so
# RUN: llvm-readobj --file-headers %t.so | FileCheck --check-prefix=COV4 %s
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index ed9581ccc93dfac..8022816d7e616d3 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1409,10 +1409,12 @@ The AMDGPU backend uses the following ELF header:
object conforms:
* ``ELFABIVERSION_AMDGPU_HSA_V2`` is used to specify the version of AMD HSA
- runtime ABI for code object V2. Can no longer be emitted by this version of LLVM.
+ runtime ABI for code object V2. Specify using the Clang option
+ ``-mcode-object-version=2``.
* ``ELFABIVERSION_AMDGPU_HSA_V3`` is used to specify the version of AMD HSA
- runtime ABI for code object V3. Can no longer be emitted by this version of LLVM.
+ runtime ABI for code object V3. Specify using the Clang option
+ ``-mcode-object-version=3``.
* ``ELFABIVERSION_AMDGPU_HSA_V4`` is used to specify the version of AMD HSA
runtime ABI for code object V4. Specify using the Clang option
@@ -3400,7 +3402,8 @@ Code Object V3 Metadata
+++++++++++++++++++++++
.. warning::
- Code object V3 generation is no longer supported by this version of LLVM.
+ Code object V3 is not the default code object version emitted by this version
+ of LLVM.
Code object V3 and above metadata is specified by the ``NT_AMDGPU_METADATA`` note
record (see :ref:`amdgpu-note-records-v3-onwards`).
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 8d0ef67a615dfc6..aadc4a68ea13278 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -341,6 +341,9 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) {
if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
switch (CodeObjectVersion) {
+ case AMDGPU::AMDHSA_COV3:
+ HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3());
+ break;
case AMDGPU::AMDHSA_COV4:
HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index b51a876750b58b0..5060cd3aec581ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -49,14 +49,14 @@ namespace AMDGPU {
namespace HSAMD {
//===----------------------------------------------------------------------===//
-// HSAMetadataStreamerV4
+// HSAMetadataStreamerV3
//===----------------------------------------------------------------------===//
-void MetadataStreamerMsgPackV4::dump(StringRef HSAMetadataString) const {
+void MetadataStreamerMsgPackV3::dump(StringRef HSAMetadataString) const {
errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
}
-void MetadataStreamerMsgPackV4::verify(StringRef HSAMetadataString) const {
+void MetadataStreamerMsgPackV3::verify(StringRef HSAMetadataString) const {
errs() << "AMDGPU HSA Metadata Parser Test: ";
msgpack::Document FromHSAMetadataString;
@@ -78,7 +78,7 @@ void MetadataStreamerMsgPackV4::verify(StringRef HSAMetadataString) const {
}
std::optional<StringRef>
-MetadataStreamerMsgPackV4::getAccessQualifier(StringRef AccQual) const {
+MetadataStreamerMsgPackV3::getAccessQualifier(StringRef AccQual) const {
return StringSwitch<std::optional<StringRef>>(AccQual)
.Case("read_only", StringRef("read_only"))
.Case("write_only", StringRef("write_only"))
@@ -86,7 +86,7 @@ MetadataStreamerMsgPackV4::getAccessQualifier(StringRef AccQual) const {
.Default(std::nullopt);
}
-std::optional<StringRef> MetadataStreamerMsgPackV4::getAddressSpaceQualifier(
+std::optional<StringRef> MetadataStreamerMsgPackV3::getAddressSpaceQualifier(
unsigned AddressSpace) const {
switch (AddressSpace) {
case AMDGPUAS::PRIVATE_ADDRESS:
@@ -107,7 +107,7 @@ std::optional<StringRef> MetadataStreamerMsgPackV4::getAddressSpaceQualifier(
}
StringRef
-MetadataStreamerMsgPackV4::getValueKind(Type *Ty, StringRef TypeQual,
+MetadataStreamerMsgPackV3::getValueKind(Type *Ty, StringRef TypeQual,
StringRef BaseTypeName) const {
if (TypeQual.contains("pipe"))
return "pipe";
@@ -134,7 +134,7 @@ MetadataStreamerMsgPackV4::getValueKind(Type *Ty, StringRef TypeQual,
: "by_value");
}
-std::string MetadataStreamerMsgPackV4::getTypeName(Type *Ty,
+std::string MetadataStreamerMsgPackV3::getTypeName(Type *Ty,
bool Signed) const {
switch (Ty->getTypeID()) {
case Type::IntegerTyID: {
@@ -173,7 +173,7 @@ std::string MetadataStreamerMsgPackV4::getTypeName(Type *Ty,
}
msgpack::ArrayDocNode
-MetadataStreamerMsgPackV4::getWorkGroupDimensions(MDNode *Node) const {
+MetadataStreamerMsgPackV3::getWorkGroupDimensions(MDNode *Node) const {
auto Dims = HSAMetadataDoc->getArrayNode();
if (Node->getNumOperands() != 3)
return Dims;
@@ -184,20 +184,14 @@ MetadataStreamerMsgPackV4::getWorkGroupDimensions(MDNode *Node) const {
return Dims;
}
-void MetadataStreamerMsgPackV4::emitVersion() {
+void MetadataStreamerMsgPackV3::emitVersion() {
auto Version = HSAMetadataDoc->getArrayNode();
- Version.push_back(Version.getDocument()->getNode(VersionMajorV4));
- Version.push_back(Version.getDocument()->getNode(VersionMinorV4));
+ Version.push_back(Version.getDocument()->getNode(VersionMajorV3));
+ Version.push_back(Version.getDocument()->getNode(VersionMinorV3));
getRootMetadata("amdhsa.version") = Version;
}
-void MetadataStreamerMsgPackV4::emitTargetID(
- const IsaInfo::AMDGPUTargetID &TargetID) {
- getRootMetadata("amdhsa.target") =
- HSAMetadataDoc->getNode(TargetID.toString(), /*Copy=*/true);
-}
-
-void MetadataStreamerMsgPackV4::emitPrintf(const Module &Mod) {
+void MetadataStreamerMsgPackV3::emitPrintf(const Module &Mod) {
auto Node = Mod.getNamedMetadata("llvm.printf.fmts");
if (!Node)
return;
@@ -210,7 +204,7 @@ void MetadataStreamerMsgPackV4::emitPrintf(const Module &Mod) {
getRootMetadata("amdhsa.printf") = Printf;
}
-void MetadataStreamerMsgPackV4::emitKernelLanguage(const Function &Func,
+void MetadataStreamerMsgPackV3::emitKernelLanguage(const Function &Func,
msgpack::MapDocNode Kern) {
// TODO: What about other languages?
auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
@@ -229,7 +223,7 @@ void MetadataStreamerMsgPackV4::emitKernelLanguage(const Function &Func,
Kern[".language_version"] = LanguageVersion;
}
-void MetadataStreamerMsgPackV4::emitKernelAttrs(const Function &Func,
+void MetadataStreamerMsgPackV3::emitKernelAttrs(const Function &Func,
msgpack::MapDocNode Kern) {
if (auto Node = Func.getMetadata("reqd_work_group_size"))
@@ -254,7 +248,7 @@ void MetadataStreamerMsgPackV4::emitKernelAttrs(const Function &Func,
Kern[".kind"] = Kern.getDocument()->getNode("fini");
}
-void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
+void MetadataStreamerMsgPackV3::emitKernelArgs(const MachineFunction &MF,
msgpack::MapDocNode Kern) {
auto &Func = MF.getFunction();
unsigned Offset = 0;
@@ -267,7 +261,7 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
Kern[".args"] = Args;
}
-void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg,
+void MetadataStreamerMsgPackV3::emitKernelArg(const Argument &Arg,
unsigned &Offset,
msgpack::ArrayDocNode Args) {
auto Func = Arg.getParent();
@@ -332,7 +326,7 @@ void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg,
AccQual, TypeQual);
}
-void MetadataStreamerMsgPackV4::emitKernelArg(
+void MetadataStreamerMsgPackV3::emitKernelArg(
const DataLayout &DL, Type *Ty, Align Alignment, StringRef ValueKind,
unsigned &Offset, msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign,
StringRef Name, StringRef TypeName, StringRef BaseTypeName,
@@ -381,7 +375,7 @@ void MetadataStreamerMsgPackV4::emitKernelArg(
Args.push_back(Arg);
}
-void MetadataStreamerMsgPackV4::emitHiddenKernelArgs(
+void MetadataStreamerMsgPackV3::emitHiddenKernelArgs(
const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) {
auto &Func = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -454,10 +448,9 @@ void MetadataStreamerMsgPackV4::emitHiddenKernelArgs(
}
}
-msgpack::MapDocNode
-MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo,
- unsigned CodeObjectVersion) const {
+msgpack::MapDocNode MetadataStreamerMsgPackV3::getHSAKernelProps(
+ const MachineFunction &MF, const SIProgramInfo &ProgramInfo,
+ unsigned CodeObjectVersion) const {
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
const Function &F = MF.getFunction();
@@ -502,19 +495,18 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
return Kern;
}
-bool MetadataStreamerMsgPackV4::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
+bool MetadataStreamerMsgPackV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true);
}
-void MetadataStreamerMsgPackV4::begin(const Module &Mod,
+void MetadataStreamerMsgPackV3::begin(const Module &Mod,
const IsaInfo::AMDGPUTargetID &TargetID) {
emitVersion();
- emitTargetID(TargetID);
emitPrintf(Mod);
getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
}
-void MetadataStreamerMsgPackV4::end() {
+void MetadataStreamerMsgPackV3::end() {
std::string HSAMetadataString;
raw_string_ostream StrOS(HSAMetadataString);
HSAMetadataDoc->toYAML(StrOS);
@@ -525,7 +517,7 @@ void MetadataStreamerMsgPackV4::end() {
verify(StrOS.str());
}
-void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF,
+void MetadataStreamerMsgPackV3::emitKernel(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) {
auto &Func = MF.getFunction();
if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
@@ -550,6 +542,31 @@ void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF,
Kernels.push_back(Kern);
}
+//===----------------------------------------------------------------------===//
+// HSAMetadataStreamerV4
+//===----------------------------------------------------------------------===//
+
+void MetadataStreamerMsgPackV4::emitVersion() {
+ auto Version = HSAMetadataDoc->getArrayNode();
+ Version.push_back(Version.getDocument()->getNode(VersionMajorV4));
+ Version.push_back(Version.getDocument()->getNode(VersionMinorV4));
+ getRootMetadata("amdhsa.version") = Version;
+}
+
+void MetadataStreamerMsgPackV4::emitTargetID(
+ const IsaInfo::AMDGPUTargetID &TargetID) {
+ getRootMetadata("amdhsa.target") =
+ HSAMetadataDoc->getNode(TargetID.toString(), /*Copy=*/true);
+}
+
+void MetadataStreamerMsgPackV4::begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) {
+ emitVersion();
+ emitTargetID(TargetID);
+ emitPrintf(Mod);
+ getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
+}
+
//===----------------------------------------------------------------------===//
// HSAMetadataStreamerV5
//===----------------------------------------------------------------------===//
@@ -663,7 +680,7 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
void MetadataStreamerMsgPackV5::emitKernelAttrs(const Function &Func,
msgpack::MapDocNode Kern) {
- MetadataStreamerMsgPackV4::emitKernelAttrs(Func, Kern);
+ MetadataStreamerMsgPackV3::emitKernelAttrs(Func, Kern);
if (Func.getFnAttribute("uniform-work-group-size").getValueAsBool())
Kern[".uniform_work_group_size"] = Kern.getDocument()->getNode(1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 18a7b5d7a9633e8..d2b3b8917ce0f70 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -62,7 +62,7 @@ class MetadataStreamer {
msgpack::MapDocNode Kern) = 0;
};
-class MetadataStreamerMsgPackV4 : public MetadataStreamer {
+class MetadataStreamerMsgPackV3 : public MetadataStreamer {
protected:
std::unique_ptr<msgpack::Document> HSAMetadataDoc =
std::make_unique<msgpack::Document>();
@@ -89,8 +89,6 @@ class MetadataStreamerMsgPackV4 : public MetadataStreamer {
void emitVersion() override;
- void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID);
-
void emitPrintf(const Module &Mod);
void emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern);
@@ -122,8 +120,8 @@ class MetadataStreamerMsgPackV4 : public MetadataStreamer {
}
public:
- MetadataStreamerMsgPackV4() = default;
- ~MetadataStreamerMsgPackV4() = default;
+ MetadataStreamerMsgPackV3() = default;
+ ~MetadataStreamerMsgPackV3() = default;
bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
@@ -136,6 +134,19 @@ class MetadataStreamerMsgPackV4 : public MetadataStreamer {
const SIProgramInfo &ProgramInfo) override;
};
+class MetadataStreamerMsgPackV4 : public MetadataStreamerMsgPackV3 {
+protected:
+ void emitVersion() override;
+ void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID);
+
+public:
+ MetadataStreamerMsgPackV4() = default;
+ ~MetadataStreamerMsgPackV4() = default;
+
+ void begin(const Module &Mod,
+ const IsaInfo::AMDGPUTargetID &TargetID) override;
+};
+
class MetadataStreamerMsgPackV5 final : public MetadataStreamerMsgPackV4 {
protected:
void emitVersion() override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 21abfb42d11ba21..3d70ed150df12f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6478,6 +6478,11 @@ bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
return legalizeTrapEndpgm(MI, MRI, B);
+ const Module *M = B.getMF().getFunction().getParent();
+ unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
+ if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
+ return legalizeTrapHsaQueuePtr(MI, MRI, B);
+
return ST.supportsGetDoorbellID() ?
legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 42af09e27e471e8..6b8c03c1620d26b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -424,6 +424,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
switch (CodeObjectVersion) {
default:
break;
+ case AMDGPU::AMDHSA_COV3:
case AMDGPU::AMDHSA_COV4:
case AMDGPU::AMDHSA_COV5:
if (getTargetID()->isXnackSupported())
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9c5b166c9652238..f8f1e6d6c9097cc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5992,6 +5992,11 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
return lowerTrapEndpgm(Op, DAG);
+ const Module *M = DAG.getMachineFunction().getFunction().getParent();
+ unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
+ if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
+ return lowerTrapHsaQueuePtr(Op, DAG);
+
return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
lowerTrapHsaQueuePtr(Op, DAG);
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5fff19eada75dd8..d123b384a27d4cc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -128,6 +128,8 @@ std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
return std::nullopt;
switch (AmdhsaCodeObjectVersion) {
+ case 3:
+ return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
case 4:
return ELF::ELFABIVERSION_AMDGPU_HSA_V4;
case 5:
@@ -138,6 +140,12 @@ std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
}
}
+bool isHsaAbiVersion3(const MCSubtargetInfo *STI) {
+ if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+ return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+ return false;
+}
+
bool isHsaAbiVersion4(const MCSubtargetInfo *STI) {
if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V4;
@@ -166,6 +174,7 @@ unsigned getCodeObjectVersion(const Module &M) {
unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) {
switch (CodeObjectVersion) {
+ case AMDHSA_COV3:
case AMDHSA_COV4:
return 48;
case AMDHSA_COV5:
@@ -179,6 +188,7 @@ unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) {
// central TD file.
unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion) {
switch (CodeObjectVersion) {
+ case AMDHSA_COV3:
case AMDHSA_COV4:
return 24;
case AMDHSA_COV5:
@@ -189,6 +199,7 @@ unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion) {
unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion) {
switch (CodeObjectVersion) {
+ case AMDHSA_COV3:
case AMDHSA_COV4:
return 32;
case AMDHSA_COV5:
@@ -199,6 +210,7 @@ unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion) {
unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) {
switch (CodeObjectVersion) {
+ case AMDHSA_COV3:
case AMDHSA_COV4:
return 40;
case AMDHSA_COV5:
@@ -762,6 +774,15 @@ std::string AMDGPUTargetID::toString() const {
std::string Features;
if (STI.getTargetTriple().getOS() == Triple::AMDHSA) {
switch (CodeObjectVersion) {
+ case AMDGPU::AMDHSA_COV3:
+ // xnack.
+ if (isXnackOnOrAny())
+ Features += "+xnack";
+ // In code object v2 and v3, "sramecc" feature was spelled with a
+ // hyphen ("sram-ecc").
+ if (isSramEccOnOrAny())
+ Features += "+sram-ecc";
+ break;
case AMDGPU::AMDHSA_COV4:
case AMDGPU::AMDHSA_COV5:
// sramecc.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 1e0994d0862cf5d..bb2964f592f66bf 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -42,12 +42,19 @@ namespace AMDGPU {
struct IsaVersion;
-enum { AMDHSA_COV4 = 4, AMDHSA_COV5 = 5 };
+enum {
+ AMDHSA_COV3 = 3,
+ AMDHSA_COV4 = 4,
+ AMDHSA_COV5 = 5
+};
/// \returns True if \p STI is AMDHSA.
bool isHsaAbi(const MCSubtargetInfo &STI);
/// \returns HSA OS ABI Version identification.
std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 3,
+/// false otherwise.
+bool isHsaAbiVersion3(const MCSubtargetInfo *STI);
/// \returns True if HSA OS ABI Version identification is 4,
/// false otherwise.
bool isHsaAbiVersion4(const MCSubtargetInfo *STI);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 4bdbe6604782a8b..c25ecafa1f7c074 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -1,11 +1,38 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V3 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V3 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s
define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
+; GFX8V3-LABEL: addrspacecast:
+; GFX8V3: ; %bb.0:
+; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40
+; GFX8V3-NEXT: v_mov_b32_e32 v2, 1
+; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT: s_mov_b32 s4, s0
+; GFX8V3-NEXT: s_mov_b32 s5, s3
+; GFX8V3-NEXT: s_cmp_lg_u32 s0, -1
+; GFX8V3-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
+; GFX8V3-NEXT: s_mov_b32 s6, s1
+; GFX8V3-NEXT: s_mov_b32 s7, s2
+; GFX8V3-NEXT: s_cmp_lg_u32 s1, -1
+; GFX8V3-NEXT: v_mov_b32_e32 v0, s4
+; GFX8V3-NEXT: s_cselect_b64 s[0:1], s[6:7], 0
+; GFX8V3-NEXT: v_mov_b32_e32 v1, s5
+; GFX8V3-NEXT: flat_store_dword v[0:1], v2
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT: v_mov_b32_e32 v2, 2
+; GFX8V3-NEXT: v_mov_b32_e32 v1, s1
+; GFX8V3-NEXT: flat_store_dword v[0:1], v2
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: s_endpgm
+;
; GFX8V4-LABEL: addrspacecast:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -55,6 +82,30 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
;
+; GFX9V3-LABEL: addrspacecast:
+; GFX9V3: ; %bb.0:
+; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V3-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX9V3-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX9V3-NEXT: v_mov_b32_e32 v2, 1
+; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT: s_mov_b32 s2, s0
+; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1
+; GFX9V3-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9V3-NEXT: s_mov_b32 s4, s1
+; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1
+; GFX9V3-NEXT: v_mov_b32_e32 v0, s2
+; GFX9V3-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
+; GFX9V3-NEXT: v_mov_b32_e32 v1, s3
+; GFX9V3-NEXT: flat_store_dword v[0:1], v2
+; GFX9V3-NEXT: s_waitcnt vmcnt(0)
+; GFX9V3-NEXT: v_mov_b32_e32 v0, s0
+; GFX9V3-NEXT: v_mov_b32_e32 v2, 2
+; GFX9V3-NEXT: v_mov_b32_e32 v1, s1
+; GFX9V3-NEXT: flat_store_dword v[0:1], v2
+; GFX9V3-NEXT: s_waitcnt vmcnt(0)
+; GFX9V3-NEXT: s_endpgm
+;
; GFX9V4-LABEL: addrspacecast:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -110,6 +161,19 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
}
define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
+; GFX8V3-LABEL: llvm_amdgcn_is_shared:
+; GFX8V3: ; %bb.0:
+; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x40
+; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0
+; GFX8V3-NEXT: s_cselect_b32 s0, 1, 0
+; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT: flat_store_dword v[0:1], v0
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: s_endpgm
+;
; GFX8V4-LABEL: llvm_amdgcn_is_shared:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -136,6 +200,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
;
+; GFX9V3-LABEL: llvm_amdgcn_is_shared:
+; GFX9V3: ; %bb.0:
+; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V3-NEXT: s_mov_b64 s[2:3], src_shared_base
+; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT: s_cmp_eq_u32 s1, s3
+; GFX9V3-NEXT: s_cselect_b32 s0, 1, 0
+; GFX9V3-NEXT: v_mov_b32_e32 v0, s0
+; GFX9V3-NEXT: global_store_dword v[0:1], v0, off
+; GFX9V3-NEXT: s_waitcnt vmcnt(0)
+; GFX9V3-NEXT: s_endpgm
+;
; GFX9V4-LABEL: llvm_amdgcn_is_shared:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -166,6 +242,19 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
}
define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
+; GFX8V3-LABEL: llvm_amdgcn_is_private:
+; GFX8V3: ; %bb.0:
+; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0
+; GFX8V3-NEXT: s_cselect_b32 s0, 1, 0
+; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT: flat_store_dword v[0:1], v0
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: s_endpgm
+;
; GFX8V4-LABEL: llvm_amdgcn_is_private:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -192,6 +281,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
;
+; GFX9V3-LABEL: llvm_amdgcn_is_private:
+; GFX9V3: ; %bb.0:
+; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V3-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT: s_cmp_eq_u32 s1, s3
+; GFX9V3-NEXT: s_cselect_b32 s0, 1, 0
+; GFX9V3-NEXT: v_mov_b32_e32 v0, s0
+; GFX9V3-NEXT: global_store_dword v[0:1], v0, off
+; GFX9V3-NEXT: s_waitcnt vmcnt(0)
+; GFX9V3-NEXT: s_endpgm
+;
; GFX9V4-LABEL: llvm_amdgcn_is_private:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -222,6 +323,11 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
}
define amdgpu_kernel void @llvm_trap() {
+; GFX8V3-LABEL: llvm_trap:
+; GFX8V3: ; %bb.0:
+; GFX8V3-NEXT: s_mov_b64 s[0:1], s[4:5]
+; GFX8V3-NEXT: s_trap 2
+;
; GFX8V4-LABEL: llvm_trap:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5]
@@ -233,6 +339,11 @@ define amdgpu_kernel void @llvm_trap() {
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_trap 2
;
+; GFX9V3-LABEL: llvm_trap:
+; GFX9V3: ; %bb.0:
+; GFX9V3-NEXT: s_mov_b64 s[0:1], s[4:5]
+; GFX9V3-NEXT: s_trap 2
+;
; GFX9V4-LABEL: llvm_trap:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_trap 2
@@ -245,6 +356,10 @@ define amdgpu_kernel void @llvm_trap() {
}
define amdgpu_kernel void @llvm_debugtrap() {
+; GFX8V3-LABEL: llvm_debugtrap:
+; GFX8V3: ; %bb.0:
+; GFX8V3-NEXT: s_trap 3
+;
; GFX8V4-LABEL: llvm_debugtrap:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_trap 3
@@ -253,6 +368,10 @@ define amdgpu_kernel void @llvm_debugtrap() {
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_trap 3
;
+; GFX9V3-LABEL: llvm_debugtrap:
+; GFX9V3: ; %bb.0:
+; GFX9V3-NEXT: s_trap 3
+;
; GFX9V4-LABEL: llvm_debugtrap:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_trap 3
@@ -265,6 +384,32 @@ define amdgpu_kernel void @llvm_debugtrap() {
}
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
+; GFX8V3-LABEL: llvm_amdgcn_queue_ptr:
+; GFX8V3: ; %bb.0:
+; GFX8V3-NEXT: v_mov_b32_e32 v0, s6
+; GFX8V3-NEXT: v_mov_b32_e32 v1, s7
+; GFX8V3-NEXT: s_add_u32 s0, s8, 8
+; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V3-NEXT: s_addc_u32 s1, s9, 0
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT: v_mov_b32_e32 v1, s1
+; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: v_mov_b32_e32 v0, s4
+; GFX8V3-NEXT: v_mov_b32_e32 v1, s5
+; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: v_mov_b32_e32 v0, s10
+; GFX8V3-NEXT: v_mov_b32_e32 v1, s11
+; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT: v_mov_b32_e32 v3, s1
+; GFX8V3-NEXT: v_mov_b32_e32 v2, s0
+; GFX8V3-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: s_endpgm
+;
; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: v_mov_b32_e32 v0, s6
@@ -315,6 +460,23 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
;
+; GFX9V3-LABEL: llvm_amdgcn_queue_ptr:
+; GFX9V3: ; %bb.0:
+; GFX9V3-NEXT: v_mov_b32_e32 v2, 0
+; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc
+; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
+; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc
+; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V3-NEXT: s_waitcnt vmcnt(0)
+; GFX9V3-NEXT: v_mov_b32_e32 v0, s10
+; GFX9V3-NEXT: v_mov_b32_e32 v1, s11
+; GFX9V3-NEXT: ; kill: killed $sgpr6_sgpr7
+; GFX9V3-NEXT: ; kill: killed $sgpr4_sgpr5
+; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9V3-NEXT: s_waitcnt vmcnt(0)
+; GFX9V3-NEXT: s_endpgm
+;
; GFX9V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll
new file mode 100644
index 000000000000000..20d0aea61f27688
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll
@@ -0,0 +1,148 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -amdgpu-verify-hsa-metadata -filetype=obj -o /dev/null < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+; CHECK-LABEL: {{^}}min_64_max_64:
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 0
+; CHECK: NumSGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 1
+define amdgpu_kernel void @min_64_max_64() #0 {
+entry:
+ ret void
+}
+attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
+
+; CHECK-LABEL: {{^}}min_64_max_128:
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 0
+; CHECK: NumSGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 1
+define amdgpu_kernel void @min_64_max_128() #1 {
+entry:
+ ret void
+}
+attributes #1 = {"amdgpu-flat-work-group-size"="64,128"}
+
+; CHECK-LABEL: {{^}}min_128_max_128:
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 0
+; CHECK: NumSGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 1
+define amdgpu_kernel void @min_128_max_128() #2 {
+entry:
+ ret void
+}
+attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
+
+; CHECK-LABEL: {{^}}min_1024_max_1024
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 10
+; CHECK: NumSGPRsForWavesPerEU: 2{{$}}
+; CHECK: NumVGPRsForWavesPerEU: 43
+ at var = addrspace(1) global float 0.0
+define amdgpu_kernel void @min_1024_max_1024() #3 {
+ %val0 = load volatile float, ptr addrspace(1) @var
+ %val1 = load volatile float, ptr addrspace(1) @var
+ %val2 = load volatile float, ptr addrspace(1) @var
+ %val3 = load volatile float, ptr addrspace(1) @var
+ %val4 = load volatile float, ptr addrspace(1) @var
+ %val5 = load volatile float, ptr addrspace(1) @var
+ %val6 = load volatile float, ptr addrspace(1) @var
+ %val7 = load volatile float, ptr addrspace(1) @var
+ %val8 = load volatile float, ptr addrspace(1) @var
+ %val9 = load volatile float, ptr addrspace(1) @var
+ %val10 = load volatile float, ptr addrspace(1) @var
+ %val11 = load volatile float, ptr addrspace(1) @var
+ %val12 = load volatile float, ptr addrspace(1) @var
+ %val13 = load volatile float, ptr addrspace(1) @var
+ %val14 = load volatile float, ptr addrspace(1) @var
+ %val15 = load volatile float, ptr addrspace(1) @var
+ %val16 = load volatile float, ptr addrspace(1) @var
+ %val17 = load volatile float, ptr addrspace(1) @var
+ %val18 = load volatile float, ptr addrspace(1) @var
+ %val19 = load volatile float, ptr addrspace(1) @var
+ %val20 = load volatile float, ptr addrspace(1) @var
+ %val21 = load volatile float, ptr addrspace(1) @var
+ %val22 = load volatile float, ptr addrspace(1) @var
+ %val23 = load volatile float, ptr addrspace(1) @var
+ %val24 = load volatile float, ptr addrspace(1) @var
+ %val25 = load volatile float, ptr addrspace(1) @var
+ %val26 = load volatile float, ptr addrspace(1) @var
+ %val27 = load volatile float, ptr addrspace(1) @var
+ %val28 = load volatile float, ptr addrspace(1) @var
+ %val29 = load volatile float, ptr addrspace(1) @var
+ %val30 = load volatile float, ptr addrspace(1) @var
+ %val31 = load volatile float, ptr addrspace(1) @var
+ %val32 = load volatile float, ptr addrspace(1) @var
+ %val33 = load volatile float, ptr addrspace(1) @var
+ %val34 = load volatile float, ptr addrspace(1) @var
+ %val35 = load volatile float, ptr addrspace(1) @var
+ %val36 = load volatile float, ptr addrspace(1) @var
+ %val37 = load volatile float, ptr addrspace(1) @var
+ %val38 = load volatile float, ptr addrspace(1) @var
+ %val39 = load volatile float, ptr addrspace(1) @var
+ %val40 = load volatile float, ptr addrspace(1) @var
+
+ store volatile float %val0, ptr addrspace(1) @var
+ store volatile float %val1, ptr addrspace(1) @var
+ store volatile float %val2, ptr addrspace(1) @var
+ store volatile float %val3, ptr addrspace(1) @var
+ store volatile float %val4, ptr addrspace(1) @var
+ store volatile float %val5, ptr addrspace(1) @var
+ store volatile float %val6, ptr addrspace(1) @var
+ store volatile float %val7, ptr addrspace(1) @var
+ store volatile float %val8, ptr addrspace(1) @var
+ store volatile float %val9, ptr addrspace(1) @var
+ store volatile float %val10, ptr addrspace(1) @var
+ store volatile float %val11, ptr addrspace(1) @var
+ store volatile float %val12, ptr addrspace(1) @var
+ store volatile float %val13, ptr addrspace(1) @var
+ store volatile float %val14, ptr addrspace(1) @var
+ store volatile float %val15, ptr addrspace(1) @var
+ store volatile float %val16, ptr addrspace(1) @var
+ store volatile float %val17, ptr addrspace(1) @var
+ store volatile float %val18, ptr addrspace(1) @var
+ store volatile float %val19, ptr addrspace(1) @var
+ store volatile float %val20, ptr addrspace(1) @var
+ store volatile float %val21, ptr addrspace(1) @var
+ store volatile float %val22, ptr addrspace(1) @var
+ store volatile float %val23, ptr addrspace(1) @var
+ store volatile float %val24, ptr addrspace(1) @var
+ store volatile float %val25, ptr addrspace(1) @var
+ store volatile float %val26, ptr addrspace(1) @var
+ store volatile float %val27, ptr addrspace(1) @var
+ store volatile float %val28, ptr addrspace(1) @var
+ store volatile float %val29, ptr addrspace(1) @var
+ store volatile float %val30, ptr addrspace(1) @var
+ store volatile float %val31, ptr addrspace(1) @var
+ store volatile float %val32, ptr addrspace(1) @var
+ store volatile float %val33, ptr addrspace(1) @var
+ store volatile float %val34, ptr addrspace(1) @var
+ store volatile float %val35, ptr addrspace(1) @var
+ store volatile float %val36, ptr addrspace(1) @var
+ store volatile float %val37, ptr addrspace(1) @var
+ store volatile float %val38, ptr addrspace(1) @var
+ store volatile float %val39, ptr addrspace(1) @var
+ store volatile float %val40, ptr addrspace(1) @var
+
+ ret void
+}
+attributes #3 = {"amdgpu-flat-work-group-size"="1024,1024"}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
+
+; CHECK: amdhsa.kernels:
+; CHECK: .max_flat_workgroup_size: 64
+; CHECK: .name: min_64_max_64
+; CHECK: .max_flat_workgroup_size: 128
+; CHECK: .name: min_64_max_128
+; CHECK: .max_flat_workgroup_size: 128
+; CHECK: .name: min_128_max_128
+; CHECK: .max_flat_workgroup_size: 1024
+; CHECK: .name: min_1024_max_1024
+; CHECK: amdhsa.version:
+; CHECK: - 1
+; CHECK: - 0
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS
diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target-v3.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target-v3.ll
new file mode 100644
index 000000000000000..6c553e3726abf90
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target-v3.ll
@@ -0,0 +1,168 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=V3-GFX600 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck --check-prefixes=V3-GFX600 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx601 < %s | FileCheck --check-prefixes=V3-GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=pitcairn < %s | FileCheck --check-prefixes=V3-GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=verde < %s | FileCheck --check-prefixes=V3-GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx602 < %s | FileCheck --check-prefixes=V3-GFX602 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hainan < %s | FileCheck --check-prefixes=V3-GFX602 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=oland < %s | FileCheck --check-prefixes=V3-GFX602 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=V3-GFX700 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=V3-GFX700 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx701 < %s | FileCheck --check-prefixes=V3-GFX701 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck --check-prefixes=V3-GFX701 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx702 < %s | FileCheck --check-prefixes=V3-GFX702 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck --check-prefixes=V3-GFX703 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kabini < %s | FileCheck --check-prefixes=V3-GFX703 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=mullins < %s | FileCheck --check-prefixes=V3-GFX703 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx704 < %s | FileCheck --check-prefixes=V3-GFX704 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck --check-prefixes=V3-GFX704 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx705 < %s | FileCheck --check-prefixes=V3-GFX705 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 < %s | FileCheck --check-prefixes=V3-GFX801-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX801-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX801-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo < %s | FileCheck --check-prefixes=V3-GFX801-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX801-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX801-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 < %s | FileCheck --check-prefixes=V3-GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland < %s | FileCheck --check-prefixes=V3-GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck --check-prefixes=V3-GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefixes=V3-GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck --check-prefixes=V3-GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=polaris10 < %s | FileCheck --check-prefixes=V3-GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=polaris11 < %s | FileCheck --check-prefixes=V3-GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx805 < %s | FileCheck --check-prefixes=V3-GFX805 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tongapro < %s | FileCheck --check-prefixes=V3-GFX805 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 < %s | FileCheck --check-prefixes=V3-GFX810-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX810-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX810-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=stoney < %s | FileCheck --check-prefixes=V3-GFX810-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=stoney -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX810-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=stoney -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX810-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=V3-GFX900-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX900-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX900-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 < %s | FileCheck --check-prefixes=V3-GFX902-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX902-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX902-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 < %s | FileCheck --check-prefixes=V3-GFX904-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX904-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX904-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=-sramecc < %s | FileCheck --check-prefixes=V3-GFX906-NOSRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=-sramecc,-xnack < %s | FileCheck --check-prefixes=V3-GFX906-NOSRAMECC-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc,-xnack < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=-sramecc,+xnack < %s | FileCheck --check-prefixes=V3-GFX906-NOSRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc,+xnack < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=-sramecc < %s | FileCheck --check-prefixes=V3-GFX908-NOSRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+sramecc < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=-sramecc,-xnack < %s | FileCheck --check-prefixes=V3-GFX908-NOSRAMECC-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+sramecc,-xnack < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=-sramecc,+xnack < %s | FileCheck --check-prefixes=V3-GFX908-NOSRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+sramecc,+xnack < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx909 < %s | FileCheck --check-prefixes=V3-GFX909-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx909 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX909-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx909 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX909-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c < %s | FileCheck --check-prefixes=V3-GFX90C-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX90C-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX90C-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefixes=V3-GFX940-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX940-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX940-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=V3-GFX1010-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX1010-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX1010-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1011 < %s | FileCheck --check-prefixes=V3-GFX1011-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1011 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX1011-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1011 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX1011-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1012 < %s | FileCheck --check-prefixes=V3-GFX1012-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1012 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX1012-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1012 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX1012-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1013 < %s | FileCheck --check-prefixes=V3-GFX1013-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1013 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX1013-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1013 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX1013-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck --check-prefixes=V3-GFX1030 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck --check-prefixes=V3-GFX1031 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1032 < %s | FileCheck --check-prefixes=V3-GFX1032 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1033 < %s | FileCheck --check-prefixes=V3-GFX1033 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1034 < %s | FileCheck --check-prefixes=V3-GFX1034 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1035 < %s | FileCheck --check-prefixes=V3-GFX1035 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1036 < %s | FileCheck --check-prefixes=V3-GFX1036 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=V3-GFX1100 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 < %s | FileCheck --check-prefixes=V3-GFX1101 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 < %s | FileCheck --check-prefixes=V3-GFX1102 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1103 < %s | FileCheck --check-prefixes=V3-GFX1103 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1150 < %s | FileCheck --check-prefixes=V3-GFX1150 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1151 < %s | FileCheck --check-prefixes=V3-GFX1151 %s
+
+; V3-GFX600: .amdgcn_target "amdgcn-amd-amdhsa--gfx600"
+; V3-GFX601: .amdgcn_target "amdgcn-amd-amdhsa--gfx601"
+; V3-GFX602: .amdgcn_target "amdgcn-amd-amdhsa--gfx602"
+; V3-GFX700: .amdgcn_target "amdgcn-amd-amdhsa--gfx700"
+; V3-GFX701: .amdgcn_target "amdgcn-amd-amdhsa--gfx701"
+; V3-GFX702: .amdgcn_target "amdgcn-amd-amdhsa--gfx702"
+; V3-GFX703: .amdgcn_target "amdgcn-amd-amdhsa--gfx703"
+; V3-GFX704: .amdgcn_target "amdgcn-amd-amdhsa--gfx704"
+; V3-GFX705: .amdgcn_target "amdgcn-amd-amdhsa--gfx705"
+; V3-GFX801-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx801"
+; V3-GFX801-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx801+xnack"
+; V3-GFX802: .amdgcn_target "amdgcn-amd-amdhsa--gfx802"
+; V3-GFX803: .amdgcn_target "amdgcn-amd-amdhsa--gfx803"
+; V3-GFX805: .amdgcn_target "amdgcn-amd-amdhsa--gfx805"
+; V3-GFX810-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx810"
+; V3-GFX810-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx810+xnack"
+; V3-GFX900-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx900"
+; V3-GFX900-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx900+xnack"
+; V3-GFX902-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx902"
+; V3-GFX902-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx902+xnack"
+; V3-GFX904-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx904"
+; V3-GFX904-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack"
+; V3-GFX906-NOSRAMECC-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx906"
+; V3-GFX906-SRAMECC-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+sram-ecc"
+; V3-GFX906-NOSRAMECC-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack"
+; V3-GFX906-SRAMECC-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack+sram-ecc"
+; V3-GFX908-NOSRAMECC-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx908"
+; V3-GFX908-SRAMECC-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx908+sram-ecc"
+; V3-GFX908-NOSRAMECC-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx908+xnack"
+; V3-GFX908-SRAMECC-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx908+xnack+sram-ecc"
+; V3-GFX909-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx909"
+; V3-GFX909-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx909+xnack"
+; V3-GFX90C-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx90c"
+; V3-GFX90C-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx90c+xnack"
+; V3-GFX940-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx940+sram-ecc"
+; V3-GFX940-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx940+xnack+sram-ecc"
+; V3-GFX1010-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010"
+; V3-GFX1010-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010+xnack"
+; V3-GFX1011-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1011"
+; V3-GFX1011-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1011+xnack"
+; V3-GFX1012-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1012"
+; V3-GFX1012-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1012+xnack"
+; V3-GFX1013-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1013"
+; V3-GFX1013-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1013+xnack"
+; V3-GFX1030: .amdgcn_target "amdgcn-amd-amdhsa--gfx1030"
+; V3-GFX1031: .amdgcn_target "amdgcn-amd-amdhsa--gfx1031"
+; V3-GFX1032: .amdgcn_target "amdgcn-amd-amdhsa--gfx1032"
+; V3-GFX1033: .amdgcn_target "amdgcn-amd-amdhsa--gfx1033"
+; V3-GFX1034: .amdgcn_target "amdgcn-amd-amdhsa--gfx1034"
+; V3-GFX1035: .amdgcn_target "amdgcn-amd-amdhsa--gfx1035"
+; V3-GFX1036: .amdgcn_target "amdgcn-amd-amdhsa--gfx1036"
+; V3-GFX1100: .amdgcn_target "amdgcn-amd-amdhsa--gfx1100"
+; V3-GFX1101: .amdgcn_target "amdgcn-amd-amdhsa--gfx1101"
+; V3-GFX1102: .amdgcn_target "amdgcn-amd-amdhsa--gfx1102"
+; V3-GFX1103: .amdgcn_target "amdgcn-amd-amdhsa--gfx1103"
+; V3-GFX1150: .amdgcn_target "amdgcn-amd-amdhsa--gfx1150"
+; V3-GFX1151: .amdgcn_target "amdgcn-amd-amdhsa--gfx1151"
+
+
+
+define amdgpu_kernel void @directive_amdgcn_target() {
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll
similarity index 98%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll
index 042abe382283a65..37b124e7f59a09f 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll
@@ -142,7 +142,7 @@ define amdgpu_kernel void @test_no_default_queue(i8 %a) #3
; CHECK: amdhsa.version:
; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
; CHECK-NOT: amdhsa.printf:
attributes #0 = { optnone noinline "amdgpu-no-default-queue" "amdgpu-no-completion-action" "amdgpu-implicitarg-num-bytes"="48" }
@@ -151,7 +151,7 @@ attributes #2 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-implici
attributes #3 = { optnone noinline "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="48" }
!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
!1 = !{i32 0}
!2 = !{!"none"}
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
index fb08fd2c45085a1..8e8023aa16f1303 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
@@ -39,4 +39,4 @@ define internal void @bar.5() {
; PARSER: AMDGPU HSA Metadata Parser Test: PASS
!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
similarity index 99%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
index dc3a6e8b633b233..69efc47008e6aad 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
@@ -1739,14 +1739,14 @@ define amdgpu_kernel void @unknown_addrspace_kernarg(ptr addrspace(12345) %ptr)
; CHECK-NEXT: - '2:1:8:%g\n'
; CHECK: amdhsa.version:
; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
attributes #0 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" }
attributes #1 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
!llvm.printf.fmts = !{!100, !101}
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v4.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
similarity index 99%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v4.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
index f4892ebdc9c9374..47b882494c9191a 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v4.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
@@ -286,7 +286,7 @@ entry:
; CHECK: amdhsa.version:
; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
; We don't have a use of llvm.amdgcn.implicitarg.ptr, so optnone to
; avoid optimizing out the implicit argument allocation.
@@ -298,4 +298,4 @@ attributes #4 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #5 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-asan.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3-asan.ll
similarity index 96%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-asan.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3-asan.ll
index 22c6e147762200d..cb3ae289721bc17 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3-asan.ll
@@ -35,12 +35,12 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0
; CHECK: amdhsa.version:
; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
attributes #0 = { sanitize_address "amdgpu-implicitarg-num-bytes"="48" }
!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
!1 = !{i32 0}
!2 = !{!"none"}
!3 = !{!"char"}
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v3.ll
similarity index 99%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v3.ll
index 8f90025fe8e29c1..a3f8c5cff95df85 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v3.ll
@@ -296,4 +296,4 @@ attributes #3 = { "amdgpu-implicitarg-num-bytes"="48" "amdgpu-no-hostcall-ptr" }
attributes #4 = { noinline }
!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
similarity index 98%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
index 6d49f22eb429b24..b7f58bbb51bb29d 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
@@ -96,10 +96,10 @@ define amdgpu_kernel void @test(ptr addrspace(1) %a,
; CHECK: amdhsa.version:
; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
!1 = !{!"image1d_t", !"image1d_array_t", !"image1d_buffer_t",
!"image2d_t", !"image2d_array_t", !"image2d_array_depth_t",
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll
similarity index 80%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll
index fc5e6e273125334..8117037baaffc9d 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll
@@ -5,9 +5,9 @@
; CHECK: ---
; CHECK: amdhsa.version:
; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
; CHECK: ...
!opencl.ocl.version = !{}
!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll
similarity index 81%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll
index 1ec79c95bc2a308..ea744863a9b8875 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll
@@ -5,10 +5,10 @@
; CHECK: ---
; CHECK: amdhsa.version:
; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
; CHECK: ...
!opencl.ocl.version = !{!0}
!llvm.module.flags = !{!1}
!0 = !{i32 1}
-!1 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!1 = !{i32 1, !"amdgpu_code_object_version", i32 300}
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
similarity index 99%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
index e45c4d1786faf67..d6f7a92af9dcb6f 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@@ -158,11 +158,11 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 {
; CHECK: amdhsa.version:
; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
attributes #0 = { "amdgpu-num-sgpr"="14" }
attributes #1 = { "amdgpu-num-vgpr"="20" }
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }
!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 9e6c0ef86906ddb..9760e93eb48e6bd 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -1,11 +1,36 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V3 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V3 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V4 %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s
define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
+; GFX8V3-LABEL: addrspacecast:
+; GFX8V3: ; %bb.0:
+; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40
+; GFX8V3-NEXT: v_mov_b32_e32 v4, 1
+; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT: s_cmp_lg_u32 s0, -1
+; GFX8V3-NEXT: s_cselect_b32 s3, s3, 0
+; GFX8V3-NEXT: s_cselect_b32 s0, s0, 0
+; GFX8V3-NEXT: s_cmp_lg_u32 s1, -1
+; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT: v_mov_b32_e32 v1, s3
+; GFX8V3-NEXT: s_cselect_b32 s0, s2, 0
+; GFX8V3-NEXT: s_cselect_b32 s1, s1, 0
+; GFX8V3-NEXT: v_mov_b32_e32 v2, s1
+; GFX8V3-NEXT: v_mov_b32_e32 v3, s0
+; GFX8V3-NEXT: flat_store_dword v[0:1], v4
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: v_mov_b32_e32 v0, 2
+; GFX8V3-NEXT: flat_store_dword v[2:3], v0
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: s_endpgm
+;
; GFX8V4-LABEL: addrspacecast:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -52,6 +77,30 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
;
+; GFX9V3-LABEL: addrspacecast:
+; GFX9V3: ; %bb.0:
+; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V3-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX9V3-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX9V3-NEXT: v_mov_b32_e32 v4, 1
+; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1
+; GFX9V3-NEXT: s_cselect_b32 s2, s3, 0
+; GFX9V3-NEXT: s_cselect_b32 s0, s0, 0
+; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1
+; GFX9V3-NEXT: v_mov_b32_e32 v0, s0
+; GFX9V3-NEXT: v_mov_b32_e32 v1, s2
+; GFX9V3-NEXT: s_cselect_b32 s0, s5, 0
+; GFX9V3-NEXT: s_cselect_b32 s1, s1, 0
+; GFX9V3-NEXT: v_mov_b32_e32 v2, s1
+; GFX9V3-NEXT: v_mov_b32_e32 v3, s0
+; GFX9V3-NEXT: flat_store_dword v[0:1], v4
+; GFX9V3-NEXT: s_waitcnt vmcnt(0)
+; GFX9V3-NEXT: v_mov_b32_e32 v0, 2
+; GFX9V3-NEXT: flat_store_dword v[2:3], v0
+; GFX9V3-NEXT: s_waitcnt vmcnt(0)
+; GFX9V3-NEXT: s_endpgm
+;
; GFX9V4-LABEL: addrspacecast:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -107,6 +156,18 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
}
define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
+; GFX8V3-LABEL: llvm_amdgcn_is_shared:
+; GFX8V3: ; %bb.0:
+; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x40
+; GFX8V3-NEXT: s_load_dword s1, s[6:7], 0x4
+; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0
+; GFX8V3-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX8V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8V3-NEXT: flat_store_dword v[0:1], v0
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: s_endpgm
+;
; GFX8V4-LABEL: llvm_amdgcn_is_shared:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x40
@@ -131,6 +192,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
;
+; GFX9V3-LABEL: llvm_amdgcn_is_shared:
+; GFX9V3: ; %bb.0:
+; GFX9V3-NEXT: s_load_dword s2, s[4:5], 0x4
+; GFX9V3-NEXT: s_mov_b64 s[0:1], src_shared_base
+; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT: s_cmp_eq_u32 s2, s1
+; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9V3-NEXT: global_store_dword v[0:1], v0, off
+; GFX9V3-NEXT: s_waitcnt vmcnt(0)
+; GFX9V3-NEXT: s_endpgm
+;
; GFX9V4-LABEL: llvm_amdgcn_is_shared:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_load_dword s2, s[4:5], 0x4
@@ -161,6 +234,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
}
define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
+; GFX8V3-LABEL: llvm_amdgcn_is_private:
+; GFX8V3: ; %bb.0:
+; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x44
+; GFX8V3-NEXT: s_load_dword s1, s[6:7], 0x4
+; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0
+; GFX8V3-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX8V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8V3-NEXT: flat_store_dword v[0:1], v0
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: s_endpgm
+;
; GFX8V4-LABEL: llvm_amdgcn_is_private:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x44
@@ -185,6 +270,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
;
+; GFX9V3-LABEL: llvm_amdgcn_is_private:
+; GFX9V3: ; %bb.0:
+; GFX9V3-NEXT: s_load_dword s2, s[4:5], 0x4
+; GFX9V3-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT: s_cmp_eq_u32 s2, s1
+; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9V3-NEXT: global_store_dword v[0:1], v0, off
+; GFX9V3-NEXT: s_waitcnt vmcnt(0)
+; GFX9V3-NEXT: s_endpgm
+;
; GFX9V4-LABEL: llvm_amdgcn_is_private:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_load_dword s2, s[4:5], 0x4
@@ -215,6 +312,11 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
}
define amdgpu_kernel void @llvm_trap() {
+; GFX8V3-LABEL: llvm_trap:
+; GFX8V3: ; %bb.0:
+; GFX8V3-NEXT: s_mov_b64 s[0:1], s[4:5]
+; GFX8V3-NEXT: s_trap 2
+;
; GFX8V4-LABEL: llvm_trap:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5]
@@ -226,6 +328,11 @@ define amdgpu_kernel void @llvm_trap() {
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_trap 2
;
+; GFX9V3-LABEL: llvm_trap:
+; GFX9V3: ; %bb.0:
+; GFX9V3-NEXT: s_mov_b64 s[0:1], s[4:5]
+; GFX9V3-NEXT: s_trap 2
+;
; GFX9V4-LABEL: llvm_trap:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_trap 2
@@ -238,6 +345,10 @@ define amdgpu_kernel void @llvm_trap() {
}
define amdgpu_kernel void @llvm_debugtrap() {
+; GFX8V3-LABEL: llvm_debugtrap:
+; GFX8V3: ; %bb.0:
+; GFX8V3-NEXT: s_trap 3
+;
; GFX8V4-LABEL: llvm_debugtrap:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_trap 3
@@ -246,6 +357,10 @@ define amdgpu_kernel void @llvm_debugtrap() {
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_trap 3
;
+; GFX9V3-LABEL: llvm_debugtrap:
+; GFX9V3: ; %bb.0:
+; GFX9V3-NEXT: s_trap 3
+;
; GFX9V4-LABEL: llvm_debugtrap:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_trap 3
@@ -258,6 +373,31 @@ define amdgpu_kernel void @llvm_debugtrap() {
}
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
+; GFX8V3-LABEL: llvm_amdgcn_queue_ptr:
+; GFX8V3: ; %bb.0:
+; GFX8V3-NEXT: v_mov_b32_e32 v0, s6
+; GFX8V3-NEXT: v_mov_b32_e32 v1, s7
+; GFX8V3-NEXT: s_add_u32 s0, s8, 8
+; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V3-NEXT: s_addc_u32 s1, s9, 0
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT: v_mov_b32_e32 v1, s1
+; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: v_mov_b32_e32 v0, s4
+; GFX8V3-NEXT: v_mov_b32_e32 v1, s5
+; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8V3-NEXT: v_mov_b32_e32 v2, s10
+; GFX8V3-NEXT: v_mov_b32_e32 v3, s11
+; GFX8V3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT: v_mov_b32_e32 v1, s1
+; GFX8V3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8V3-NEXT: s_waitcnt vmcnt(0)
+; GFX8V3-NEXT: s_endpgm
+;
; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: v_mov_b32_e32 v0, s6
@@ -306,6 +446,23 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
;
+; GFX9V3-LABEL: llvm_amdgcn_queue_ptr:
+; GFX9V3: ; %bb.0:
+; GFX9V3-NEXT: v_mov_b32_e32 v2, 0
+; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc
+; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
+; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc
+; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V3-NEXT: s_waitcnt vmcnt(0)
+; GFX9V3-NEXT: v_mov_b32_e32 v0, s10
+; GFX9V3-NEXT: v_mov_b32_e32 v1, s11
+; GFX9V3-NEXT: ; kill: killed $sgpr6_sgpr7
+; GFX9V3-NEXT: ; kill: killed $sgpr4_sgpr5
+; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9V3-NEXT: s_waitcnt vmcnt(0)
+; GFX9V3-NEXT: s_endpgm
+;
; GFX9V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
index a04fe28dbffffab..0353e7ee49ab953 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
@@ -1,8 +1,17 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefix=DOORBELL %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefix=DOORBELL %s
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA %s
declare void @llvm.trap() #0
+; HSA: .amdhsa_kernel trap
+; HSA-NEXT: .amdhsa_group_segment_fixed_size 0
+; HSA-NEXT: .amdhsa_private_segment_fixed_size 0
+; HSA-NEXT: .amdhsa_kernarg_size 8
+; HSA-NEXT: .amdhsa_user_sgpr_count 8
+; HSA-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+; HSA: .end_amdhsa_kernel
+
; DOORBELL: .amdhsa_kernel trap
; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0
; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
index 9ed896c148e6485..792ec2675247f61 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -317,4 +317,4 @@ attributes #1 = { nounwind "stackrealign" }
attributes #2 = { nounwind alignstack=128 }
!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 54a15513cf0a506..03ea582698486ce 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -1,54 +1,101 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc %s -o - -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=NOHSA-TRAP-GFX900 %s
-; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX803 %s
-; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900 %s
-; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=NOHSA-TRAP-GFX900-V3 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=NOHSA-TRAP-GFX900-V4 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX803-V3 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX803-V4 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900-V3 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900-V4 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900-V3 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900-V4 %s
declare void @llvm.trap() #0
declare void @llvm.debugtrap() #1
define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
-; NOHSA-TRAP-GFX900-LABEL: trap:
-; NOHSA-TRAP-GFX900: ; %bb.0:
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
-; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1
-; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
-; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: s_endpgm
-;
-; HSA-TRAP-GFX803-LABEL: trap:
-; HSA-TRAP-GFX803: ; %bb.0:
-; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1
-; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5]
-; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s2
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s3
-; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
-; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX803-NEXT: s_trap 2
-;
-; HSA-TRAP-GFX900-LABEL: trap:
-; HSA-TRAP-GFX900: ; %bb.0:
-; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
-; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1
-; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; HSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
-; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX900-NEXT: s_trap 2
-;
-; HSA-NOTRAP-GFX900-LABEL: trap:
-; HSA-NOTRAP-GFX900: ; %bb.0:
-; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
-; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1
-; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
-; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT: s_endpgm
+; NOHSA-TRAP-GFX900-V3-LABEL: trap:
+; NOHSA-TRAP-GFX900-V3: ; %bb.0:
+; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0
+; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1
+; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT: s_endpgm
+;
+; NOHSA-TRAP-GFX900-V4-LABEL: trap:
+; NOHSA-TRAP-GFX900-V4: ; %bb.0:
+; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
+; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1
+; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX803-V3-LABEL: trap:
+; HSA-TRAP-GFX803-V3: ; %bb.0:
+; HSA-TRAP-GFX803-V3-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
+; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v2, 1
+; HSA-TRAP-GFX803-V3-NEXT: s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v0, s2
+; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v1, s3
+; HSA-TRAP-GFX803-V3-NEXT: flat_store_dword v[0:1], v2
+; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT: s_trap 2
+;
+; HSA-TRAP-GFX803-V4-LABEL: trap:
+; HSA-TRAP-GFX803-V4: ; %bb.0:
+; HSA-TRAP-GFX803-V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
+; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v2, 1
+; HSA-TRAP-GFX803-V4-NEXT: s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v0, s2
+; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v1, s3
+; HSA-TRAP-GFX803-V4-NEXT: flat_store_dword v[0:1], v2
+; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT: s_trap 2
+;
+; HSA-TRAP-GFX900-V3-LABEL: trap:
+; HSA-TRAP-GFX900-V3: ; %bb.0:
+; HSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
+; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1
+; HSA-TRAP-GFX900-V3-NEXT: s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[2:3]
+; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT: s_trap 2
+;
+; HSA-TRAP-GFX900-V4-LABEL: trap:
+; HSA-TRAP-GFX900-V4: ; %bb.0:
+; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1
+; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1]
+; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT: s_trap 2
+;
+; HSA-NOTRAP-GFX900-V3-LABEL: trap:
+; HSA-NOTRAP-GFX900-V3: ; %bb.0:
+; HSA-NOTRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0
+; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1
+; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1]
+; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT: s_endpgm
+;
+; HSA-NOTRAP-GFX900-V4-LABEL: trap:
+; HSA-NOTRAP-GFX900-V4: ; %bb.0:
+; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
+; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1
+; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1]
+; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT: s_endpgm
store volatile i32 1, ptr addrspace(1) %arg0
call void @llvm.trap()
unreachable
@@ -57,77 +104,150 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
}
define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
-; NOHSA-TRAP-GFX900-LABEL: non_entry_trap:
-; NOHSA-TRAP-GFX900: ; %bb.0: ; %entry
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
-; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
-; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
-; NOHSA-TRAP-GFX900-NEXT: s_cbranch_vccz .LBB1_2
-; NOHSA-TRAP-GFX900-NEXT: ; %bb.1: ; %ret
-; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 3
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
-; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: s_endpgm
-; NOHSA-TRAP-GFX900-NEXT: .LBB1_2: ; %trap
-; NOHSA-TRAP-GFX900-NEXT: s_endpgm
-;
-; HSA-TRAP-GFX803-LABEL: non_entry_trap:
-; HSA-TRAP-GFX803: ; %bb.0: ; %entry
-; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1
-; HSA-TRAP-GFX803-NEXT: flat_load_dword v0, v[0:1] glc
-; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX803-NEXT: v_cmp_eq_u32_e32 vcc, -1, v0
-; HSA-TRAP-GFX803-NEXT: s_cbranch_vccz .LBB1_2
-; HSA-TRAP-GFX803-NEXT: ; %bb.1: ; %ret
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 3
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1
-; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
-; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX803-NEXT: s_endpgm
-; HSA-TRAP-GFX803-NEXT: .LBB1_2: ; %trap
-; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5]
-; HSA-TRAP-GFX803-NEXT: s_trap 2
-;
-; HSA-TRAP-GFX900-LABEL: non_entry_trap:
-; HSA-TRAP-GFX900: ; %bb.0: ; %entry
-; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
-; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; HSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
-; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
-; HSA-TRAP-GFX900-NEXT: s_cbranch_vccz .LBB1_2
-; HSA-TRAP-GFX900-NEXT: ; %bb.1: ; %ret
-; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 3
-; HSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
-; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX900-NEXT: s_endpgm
-; HSA-TRAP-GFX900-NEXT: .LBB1_2: ; %trap
-; HSA-TRAP-GFX900-NEXT: s_trap 2
-;
-; HSA-NOTRAP-GFX900-LABEL: non_entry_trap:
-; HSA-NOTRAP-GFX900: ; %bb.0: ; %entry
-; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
-; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
-; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
-; HSA-NOTRAP-GFX900-NEXT: s_cbranch_vccz .LBB1_2
-; HSA-NOTRAP-GFX900-NEXT: ; %bb.1: ; %ret
-; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v1, 3
-; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
-; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT: s_endpgm
-; HSA-NOTRAP-GFX900-NEXT: .LBB1_2: ; %trap
-; HSA-NOTRAP-GFX900-NEXT: s_endpgm
+; NOHSA-TRAP-GFX900-V3-LABEL: non_entry_trap:
+; NOHSA-TRAP-GFX900-V3: ; %bb.0: ; %entry
+; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0
+; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT: global_load_dword v1, v0, s[0:1] glc
+; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
+; NOHSA-TRAP-GFX900-V3-NEXT: s_cbranch_vccz .LBB1_2
+; NOHSA-TRAP-GFX900-V3-NEXT: ; %bb.1: ; %ret
+; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 3
+; NOHSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT: s_endpgm
+; NOHSA-TRAP-GFX900-V3-NEXT: .LBB1_2: ; %trap
+; NOHSA-TRAP-GFX900-V3-NEXT: s_endpgm
+;
+; NOHSA-TRAP-GFX900-V4-LABEL: non_entry_trap:
+; NOHSA-TRAP-GFX900-V4: ; %bb.0: ; %entry
+; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
+; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc
+; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
+; NOHSA-TRAP-GFX900-V4-NEXT: s_cbranch_vccz .LBB1_2
+; NOHSA-TRAP-GFX900-V4-NEXT: ; %bb.1: ; %ret
+; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 3
+; NOHSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT: s_endpgm
+; NOHSA-TRAP-GFX900-V4-NEXT: .LBB1_2: ; %trap
+; NOHSA-TRAP-GFX900-V4-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX803-V3-LABEL: non_entry_trap:
+; HSA-TRAP-GFX803-V3: ; %bb.0: ; %entry
+; HSA-TRAP-GFX803-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v1, s1
+; HSA-TRAP-GFX803-V3-NEXT: flat_load_dword v0, v[0:1] glc
+; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT: v_cmp_eq_u32_e32 vcc, -1, v0
+; HSA-TRAP-GFX803-V3-NEXT: s_cbranch_vccz .LBB1_2
+; HSA-TRAP-GFX803-V3-NEXT: ; %bb.1: ; %ret
+; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v2, 3
+; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v1, s1
+; HSA-TRAP-GFX803-V3-NEXT: flat_store_dword v[0:1], v2
+; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT: s_endpgm
+; HSA-TRAP-GFX803-V3-NEXT: .LBB1_2: ; %trap
+; HSA-TRAP-GFX803-V3-NEXT: s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-GFX803-V3-NEXT: s_trap 2
+;
+; HSA-TRAP-GFX803-V4-LABEL: non_entry_trap:
+; HSA-TRAP-GFX803-V4: ; %bb.0: ; %entry
+; HSA-TRAP-GFX803-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v1, s1
+; HSA-TRAP-GFX803-V4-NEXT: flat_load_dword v0, v[0:1] glc
+; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT: v_cmp_eq_u32_e32 vcc, -1, v0
+; HSA-TRAP-GFX803-V4-NEXT: s_cbranch_vccz .LBB1_2
+; HSA-TRAP-GFX803-V4-NEXT: ; %bb.1: ; %ret
+; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v2, 3
+; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v1, s1
+; HSA-TRAP-GFX803-V4-NEXT: flat_store_dword v[0:1], v2
+; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT: s_endpgm
+; HSA-TRAP-GFX803-V4-NEXT: .LBB1_2: ; %trap
+; HSA-TRAP-GFX803-V4-NEXT: s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-GFX803-V4-NEXT: s_trap 2
+;
+; HSA-TRAP-GFX900-V3-LABEL: non_entry_trap:
+; HSA-TRAP-GFX900-V3: ; %bb.0: ; %entry
+; HSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT: global_load_dword v1, v0, s[0:1] glc
+; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
+; HSA-TRAP-GFX900-V3-NEXT: s_cbranch_vccz .LBB1_2
+; HSA-TRAP-GFX900-V3-NEXT: ; %bb.1: ; %ret
+; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 3
+; HSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1]
+; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT: s_endpgm
+; HSA-TRAP-GFX900-V3-NEXT: .LBB1_2: ; %trap
+; HSA-TRAP-GFX900-V3-NEXT: s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-GFX900-V3-NEXT: s_trap 2
+;
+; HSA-TRAP-GFX900-V4-LABEL: non_entry_trap:
+; HSA-TRAP-GFX900-V4: ; %bb.0: ; %entry
+; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc
+; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
+; HSA-TRAP-GFX900-V4-NEXT: s_cbranch_vccz .LBB1_2
+; HSA-TRAP-GFX900-V4-NEXT: ; %bb.1: ; %ret
+; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 3
+; HSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1]
+; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT: s_endpgm
+; HSA-TRAP-GFX900-V4-NEXT: .LBB1_2: ; %trap
+; HSA-TRAP-GFX900-V4-NEXT: s_trap 2
+;
+; HSA-NOTRAP-GFX900-V3-LABEL: non_entry_trap:
+; HSA-NOTRAP-GFX900-V3: ; %bb.0: ; %entry
+; HSA-NOTRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0
+; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT: global_load_dword v1, v0, s[0:1] glc
+; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
+; HSA-NOTRAP-GFX900-V3-NEXT: s_cbranch_vccz .LBB1_2
+; HSA-NOTRAP-GFX900-V3-NEXT: ; %bb.1: ; %ret
+; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 3
+; HSA-NOTRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1]
+; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT: s_endpgm
+; HSA-NOTRAP-GFX900-V3-NEXT: .LBB1_2: ; %trap
+; HSA-NOTRAP-GFX900-V3-NEXT: s_endpgm
+;
+; HSA-NOTRAP-GFX900-V4-LABEL: non_entry_trap:
+; HSA-NOTRAP-GFX900-V4: ; %bb.0: ; %entry
+; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
+; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc
+; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
+; HSA-NOTRAP-GFX900-V4-NEXT: s_cbranch_vccz .LBB1_2
+; HSA-NOTRAP-GFX900-V4-NEXT: ; %bb.1: ; %ret
+; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 3
+; HSA-NOTRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1]
+; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT: s_endpgm
+; HSA-NOTRAP-GFX900-V4-NEXT: .LBB1_2: ; %trap
+; HSA-NOTRAP-GFX900-V4-NEXT: s_endpgm
entry:
%tmp29 = load volatile i32, ptr addrspace(1) %arg0
%cmp = icmp eq i32 %tmp29, -1
@@ -143,60 +263,115 @@ ret:
}
define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) {
-; NOHSA-TRAP-GFX900-LABEL: debugtrap:
-; NOHSA-TRAP-GFX900: ; %bb.0:
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
-; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1
-; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2
-; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
-; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v2, s[0:1]
-; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: s_endpgm
-;
-; HSA-TRAP-GFX803-LABEL: debugtrap:
-; HSA-TRAP-GFX803: ; %bb.0:
-; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2
-; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1
-; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
-; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX803-NEXT: s_trap 3
-; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v3
-; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX803-NEXT: s_endpgm
-;
-; HSA-TRAP-GFX900-LABEL: debugtrap:
-; HSA-TRAP-GFX900: ; %bb.0:
-; HSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
-; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1
-; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2
-; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; HSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
-; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX900-NEXT: s_trap 3
-; HSA-TRAP-GFX900-NEXT: global_store_dword v0, v2, s[0:1]
-; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX900-NEXT: s_endpgm
-;
-; HSA-NOTRAP-GFX900-LABEL: debugtrap:
-; HSA-NOTRAP-GFX900: ; %bb.0:
-; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
-; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1
-; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2
-; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
-; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v2, s[0:1]
-; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT: s_endpgm
+; NOHSA-TRAP-GFX900-V3-LABEL: debugtrap:
+; NOHSA-TRAP-GFX900-V3: ; %bb.0:
+; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0
+; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1
+; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2
+; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v2, s[0:1]
+; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT: s_endpgm
+;
+; NOHSA-TRAP-GFX900-V4-LABEL: debugtrap:
+; NOHSA-TRAP-GFX900-V4: ; %bb.0:
+; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
+; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1
+; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2
+; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v2, s[0:1]
+; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX803-V3-LABEL: debugtrap:
+; HSA-TRAP-GFX803-V3: ; %bb.0:
+; HSA-TRAP-GFX803-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v2, 1
+; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v3, 2
+; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v1, s1
+; HSA-TRAP-GFX803-V3-NEXT: flat_store_dword v[0:1], v2
+; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT: s_trap 3
+; HSA-TRAP-GFX803-V3-NEXT: flat_store_dword v[0:1], v3
+; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX803-V4-LABEL: debugtrap:
+; HSA-TRAP-GFX803-V4: ; %bb.0:
+; HSA-TRAP-GFX803-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v2, 1
+; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v3, 2
+; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v1, s1
+; HSA-TRAP-GFX803-V4-NEXT: flat_store_dword v[0:1], v2
+; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT: s_trap 3
+; HSA-TRAP-GFX803-V4-NEXT: flat_store_dword v[0:1], v3
+; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX900-V3-LABEL: debugtrap:
+; HSA-TRAP-GFX900-V3: ; %bb.0:
+; HSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1
+; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2
+; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1]
+; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT: s_trap 3
+; HSA-TRAP-GFX900-V3-NEXT: global_store_dword v0, v2, s[0:1]
+; HSA-TRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX900-V4-LABEL: debugtrap:
+; HSA-TRAP-GFX900-V4: ; %bb.0:
+; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1
+; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2
+; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1]
+; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT: s_trap 3
+; HSA-TRAP-GFX900-V4-NEXT: global_store_dword v0, v2, s[0:1]
+; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT: s_endpgm
+;
+; HSA-NOTRAP-GFX900-V3-LABEL: debugtrap:
+; HSA-NOTRAP-GFX900-V3: ; %bb.0:
+; HSA-NOTRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0
+; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1
+; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2
+; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT: global_store_dword v0, v1, s[0:1]
+; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT: global_store_dword v0, v2, s[0:1]
+; HSA-NOTRAP-GFX900-V3-NEXT: s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT: s_endpgm
+;
+; HSA-NOTRAP-GFX900-V4-LABEL: debugtrap:
+; HSA-NOTRAP-GFX900-V4: ; %bb.0:
+; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0
+; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1
+; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2
+; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT: global_store_dword v0, v1, s[0:1]
+; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT: global_store_dword v0, v2, s[0:1]
+; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT: s_endpgm
store volatile i32 1, ptr addrspace(1) %arg0
call void @llvm.debugtrap()
store volatile i32 2, ptr addrspace(1) %arg0
@@ -207,4 +382,4 @@ attributes #0 = { nounwind noreturn }
attributes #1 = { nounwind }
!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION}
diff --git a/llvm/test/MC/AMDGPU/hsa-diag-v4.s b/llvm/test/MC/AMDGPU/hsa-diag-v3.s
similarity index 94%
rename from llvm/test/MC/AMDGPU/hsa-diag-v4.s
rename to llvm/test/MC/AMDGPU/hsa-diag-v3.s
index f7a554aedb746b0..369ac905ad2b27b 100644
--- a/llvm/test/MC/AMDGPU/hsa-diag-v4.s
+++ b/llvm/test/MC/AMDGPU/hsa-diag-v3.s
@@ -1,18 +1,18 @@
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX8,PREGFX10,AMDHSA
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX10,AMDHSA
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1100 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX11,AMDHSA
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd- -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,NONAMDHSA
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GFX90A,PREGFX10,AMDHSA,ALL
+// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX8,PREGFX10,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX10,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx1100 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX11,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd- -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,NONAMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GFX90A,PREGFX10,AMDHSA,ALL
.text
// GCN-LABEL: warning: test_target
// GFX8-NOT: error:
-// GFX10: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1010:xnack+
-// GFX11: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1100
-// NONAMDHSA: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-unknown--gfx810
+// GFX10: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810+xnack does not match the specified target id amdgcn-amd-amdhsa--gfx1010+xnack
+// GFX11: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810+xnack does not match the specified target id amdgcn-amd-amdhsa--gfx1100
+// NONAMDHSA: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810+xnack does not match the specified target id amdgcn-amd-unknown--gfx810
.warning "test_target"
-.amdgcn_target "amdgcn-amd-amdhsa--gfx810:xnack+"
+.amdgcn_target "amdgcn-amd-amdhsa--gfx810+xnack"
// GCN-LABEL: warning: test_amdhsa_kernel_no_name
// GCN: error: unknown directive
diff --git a/llvm/test/MC/AMDGPU/hsa-gfx10-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx10-v3.s
new file mode 100644
index 000000000000000..ba60000837cdc06
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-gfx10-v3.s
@@ -0,0 +1,226 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 --amdhsa-code-object-version=3 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 --amdhsa-code-object-version=3 -mattr=+xnack -filetype=obj < %s > %t
+// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// READOBJ: Section Headers
+// READOBJ: .text PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256
+// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} 0000c0 {{[0-9]+}} A {{[0-9]+}} {{[0-9]+}} 64
+
+// READOBJ: Relocation section '.rela.rodata' at offset
+// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10
+// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110
+// READOBJ: 0000000000000090 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 210
+
+// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries:
+// READOBJ: 0000000000000000 0 FUNC LOCAL PROTECTED 2 minimal
+// READOBJ-NEXT: 0000000000000100 0 FUNC LOCAL PROTECTED 2 complete
+// READOBJ-NEXT: 0000000000000200 0 FUNC LOCAL PROTECTED 2 special_sgpr
+// READOBJ-NEXT: 0000000000000000 64 OBJECT LOCAL DEFAULT 3 minimal.kd
+// READOBJ-NEXT: 0000000000000040 64 OBJECT LOCAL DEFAULT 3 complete.kd
+// READOBJ-NEXT: 0000000000000080 64 OBJECT LOCAL DEFAULT 3 special_sgpr.kd
+
+// OBJDUMP: Contents of section .rodata
+// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here.
+// minimal
+// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 0000ac60 80000000 00000000 00000000
+// complete
+// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0070 015001e4 1f0f007f 7f040000 00000000
+// special_sgpr
+// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00b0 00000060 80000000 00000000 00000000
+
+.text
+// ASM: .text
+
+.amdgcn_target "amdgcn-amd-amdhsa--gfx1010+xnack"
+// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010+xnack"
+
+.p2align 8
+.type minimal, at function
+minimal:
+ s_endpgm
+
+.p2align 8
+.type complete, at function
+complete:
+ s_endpgm
+
+.p2align 8
+.type special_sgpr, at function
+special_sgpr:
+ s_endpgm
+
+.rodata
+// ASM: .rodata
+
+// Test that only specifying required directives is allowed, and that defaulted
+// values are omitted.
+.p2align 6
+.amdhsa_kernel minimal
+ .amdhsa_next_free_vgpr 0
+ .amdhsa_next_free_sgpr 0
+ .amdhsa_shared_vgpr_count 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel minimal
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM: .amdhsa_shared_vgpr_count 0
+// ASM: .end_amdhsa_kernel
+
+// Test that we can specify all available directives with non-default values.
+.p2align 6
+.amdhsa_kernel complete
+ .amdhsa_group_segment_fixed_size 1
+ .amdhsa_private_segment_fixed_size 1
+ .amdhsa_kernarg_size 8
+ .amdhsa_user_sgpr_private_segment_buffer 1
+ .amdhsa_user_sgpr_dispatch_ptr 1
+ .amdhsa_user_sgpr_queue_ptr 1
+ .amdhsa_user_sgpr_kernarg_segment_ptr 1
+ .amdhsa_user_sgpr_dispatch_id 1
+ .amdhsa_user_sgpr_flat_scratch_init 1
+ .amdhsa_user_sgpr_private_segment_size 1
+ .amdhsa_wavefront_size32 1
+ .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+ .amdhsa_system_sgpr_workgroup_id_x 0
+ .amdhsa_system_sgpr_workgroup_id_y 1
+ .amdhsa_system_sgpr_workgroup_id_z 1
+ .amdhsa_system_sgpr_workgroup_info 1
+ .amdhsa_system_vgpr_workitem_id 1
+ .amdhsa_next_free_vgpr 9
+ .amdhsa_next_free_sgpr 27
+ .amdhsa_reserve_vcc 0
+ .amdhsa_reserve_flat_scratch 0
+ .amdhsa_reserve_xnack_mask 1
+ .amdhsa_float_round_mode_32 1
+ .amdhsa_float_round_mode_16_64 1
+ .amdhsa_float_denorm_mode_32 1
+ .amdhsa_float_denorm_mode_16_64 0
+ .amdhsa_dx10_clamp 0
+ .amdhsa_ieee_mode 0
+ .amdhsa_fp16_overflow 1
+ .amdhsa_workgroup_processor_mode 1
+ .amdhsa_memory_ordered 1
+ .amdhsa_forward_progress 1
+ .amdhsa_exception_fp_ieee_invalid_op 1
+ .amdhsa_exception_fp_denorm_src 1
+ .amdhsa_exception_fp_ieee_div_zero 1
+ .amdhsa_exception_fp_ieee_overflow 1
+ .amdhsa_exception_fp_ieee_underflow 1
+ .amdhsa_exception_fp_ieee_inexact 1
+ .amdhsa_exception_int_div_zero 1
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel complete
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_kernarg_size 8
+// ASM-NEXT: .amdhsa_user_sgpr_count 15
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1
+// ASM-NEXT: .amdhsa_wavefront_size32 1
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1
+// ASM-NEXT: .amdhsa_next_free_vgpr 9
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_reserve_flat_scratch 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 1
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_workgroup_processor_mode 1
+// ASM-NEXT: .amdhsa_memory_ordered 1
+// ASM-NEXT: .amdhsa_forward_progress 1
+// ASM-NEXT: .amdhsa_shared_vgpr_count 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
+
+// Test that we are including special SGPR usage in the granulated count.
+.p2align 6
+.amdhsa_kernel special_sgpr
+ // Same next_free_sgpr as "complete", but...
+ .amdhsa_next_free_sgpr 27
+ // ...on GFX10+ this should require an additional 6 SGPRs, pushing us from
+ // 3 granules to 4
+ .amdhsa_reserve_flat_scratch 1
+
+ .amdhsa_reserve_vcc 0
+ .amdhsa_reserve_xnack_mask 1
+
+ .amdhsa_float_denorm_mode_16_64 0
+ .amdhsa_dx10_clamp 0
+ .amdhsa_ieee_mode 0
+ .amdhsa_next_free_vgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel special_sgpr
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM: .end_amdhsa_kernel
+
+.section .foo
+
+.byte .amdgcn.gfx_generation_number
+// ASM: .byte 10
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v7, s10
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 8
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 11
+
+.set .amdgcn.next_free_vgpr, 0
+.set .amdgcn.next_free_sgpr, 0
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v16, s3
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 17
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 4
diff --git a/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s
new file mode 100644
index 000000000000000..7f885b457aa63a3
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s
@@ -0,0 +1,213 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1100 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1100 --amdhsa-code-object-version=3 -filetype=obj < %s > %t
+// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// READOBJ: Section Headers
+// READOBJ: .text PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256
+// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} 0000c0 {{[0-9]+}} A {{[0-9]+}} {{[0-9]+}} 64
+
+// READOBJ: Relocation section '.rela.rodata' at offset
+// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10
+// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110
+// READOBJ: 0000000000000090 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 210
+
+// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries:
+// READOBJ: 0000000000000000 0 FUNC LOCAL PROTECTED 2 minimal
+// READOBJ-NEXT: 0000000000000100 0 FUNC LOCAL PROTECTED 2 complete
+// READOBJ-NEXT: 0000000000000200 0 FUNC LOCAL PROTECTED 2 special_sgpr
+// READOBJ-NEXT: 0000000000000000 64 OBJECT LOCAL DEFAULT 3 minimal.kd
+// READOBJ-NEXT: 0000000000000040 64 OBJECT LOCAL DEFAULT 3 complete.kd
+// READOBJ-NEXT: 0000000000000080 64 OBJECT LOCAL DEFAULT 3 special_sgpr.kd
+
+// OBJDUMP: Contents of section .rodata
+// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here.
+// minimal
+// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 0000ac60 80000000 00000000 00000000
+// complete
+// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0070 015001e4 130f007f 5e040000 00000000
+// special_sgpr
+// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00b0 00000060 80000000 00000000 00000000
+
+.text
+// ASM: .text
+
+.amdgcn_target "amdgcn-amd-amdhsa--gfx1100"
+// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx1100"
+
+.p2align 8
+.type minimal, at function
+minimal:
+ s_endpgm
+
+.p2align 8
+.type complete, at function
+complete:
+ s_endpgm
+
+.p2align 8
+.type special_sgpr, at function
+special_sgpr:
+ s_endpgm
+
+.rodata
+// ASM: .rodata
+
+// Test that only specifying required directives is allowed, and that defaulted
+// values are omitted.
+.p2align 6
+.amdhsa_kernel minimal
+ .amdhsa_next_free_vgpr 0
+ .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel minimal
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM: .end_amdhsa_kernel
+
+// Test that we can specify all available directives with non-default values.
+.p2align 6
+.amdhsa_kernel complete
+ .amdhsa_group_segment_fixed_size 1
+ .amdhsa_private_segment_fixed_size 1
+ .amdhsa_kernarg_size 8
+ .amdhsa_user_sgpr_dispatch_ptr 1
+ .amdhsa_user_sgpr_queue_ptr 1
+ .amdhsa_user_sgpr_kernarg_segment_ptr 1
+ .amdhsa_user_sgpr_dispatch_id 1
+ .amdhsa_user_sgpr_private_segment_size 1
+ .amdhsa_wavefront_size32 1
+ .amdhsa_enable_private_segment 1
+ .amdhsa_system_sgpr_workgroup_id_x 0
+ .amdhsa_system_sgpr_workgroup_id_y 1
+ .amdhsa_system_sgpr_workgroup_id_z 1
+ .amdhsa_system_sgpr_workgroup_info 1
+ .amdhsa_system_vgpr_workitem_id 1
+ .amdhsa_next_free_vgpr 9
+ .amdhsa_next_free_sgpr 27
+ .amdhsa_reserve_vcc 0
+ .amdhsa_float_round_mode_32 1
+ .amdhsa_float_round_mode_16_64 1
+ .amdhsa_float_denorm_mode_32 1
+ .amdhsa_float_denorm_mode_16_64 0
+ .amdhsa_dx10_clamp 0
+ .amdhsa_ieee_mode 0
+ .amdhsa_fp16_overflow 1
+ .amdhsa_workgroup_processor_mode 1
+ .amdhsa_memory_ordered 1
+ .amdhsa_forward_progress 1
+ .amdhsa_exception_fp_ieee_invalid_op 1
+ .amdhsa_exception_fp_denorm_src 1
+ .amdhsa_exception_fp_ieee_div_zero 1
+ .amdhsa_exception_fp_ieee_overflow 1
+ .amdhsa_exception_fp_ieee_underflow 1
+ .amdhsa_exception_fp_ieee_inexact 1
+ .amdhsa_exception_int_div_zero 1
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel complete
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_kernarg_size 8
+// ASM-NEXT: .amdhsa_user_sgpr_count 9
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1
+// ASM-NEXT: .amdhsa_wavefront_size32 1
+// ASM-NEXT: .amdhsa_enable_private_segment 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1
+// ASM-NEXT: .amdhsa_next_free_vgpr 9
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_float_round_mode_32 1
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_workgroup_processor_mode 1
+// ASM-NEXT: .amdhsa_memory_ordered 1
+// ASM-NEXT: .amdhsa_forward_progress 1
+// ASM-NEXT: .amdhsa_shared_vgpr_count 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
+
+// Test that we are including special SGPR usage in the granulated count.
+.p2align 6
+.amdhsa_kernel special_sgpr
+ // Same next_free_sgpr as "complete", but...
+ .amdhsa_next_free_sgpr 27
+ // ...on GFX10+ this should require an additional 6 SGPRs, pushing us from
+ // 3 granules to 4
+
+ .amdhsa_reserve_vcc 0
+
+ .amdhsa_float_denorm_mode_16_64 0
+ .amdhsa_dx10_clamp 0
+ .amdhsa_ieee_mode 0
+ .amdhsa_next_free_vgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel special_sgpr
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM: .end_amdhsa_kernel
+
+.section .foo
+
+.byte .amdgcn.gfx_generation_number
+// ASM: .byte 11
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v7, s10
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 8
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 11
+
+.set .amdgcn.next_free_vgpr, 0
+.set .amdgcn.next_free_sgpr, 0
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v16, s3
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 17
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 4
diff --git a/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s
new file mode 100644
index 000000000000000..fd84fab8af81685
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s
@@ -0,0 +1,184 @@
+// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=obj < %s > %t
+// RUN: llvm-readobj --elf-output-style=GNU --sections --symbols --relocations %t | FileCheck --check-prefix=READOBJ %s
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// READOBJ: Section Headers
+// READOBJ: .text PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256
+// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} 000080 {{[0-9]+}} A {{[0-9]+}} {{[0-9]+}} 64
+
+// READOBJ: Relocation section '.rela.rodata' at offset
+// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10
+// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110
+
+// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries:
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000100 0 FUNC LOCAL PROTECTED 2 complete
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000040 64 OBJECT LOCAL DEFAULT 3 complete.kd
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000000 0 FUNC LOCAL PROTECTED 2 minimal
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000000 64 OBJECT LOCAL DEFAULT 3 minimal.kd
+
+// OBJDUMP: Contents of section .rodata
+// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here.
+// minimal
+// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000
+// complete
+// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100
+// OBJDUMP-NEXT: 0070 c1500104 210f007f 7f008100 00000000
+
+.text
+// ASM: .text
+
+.amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc"
+// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc"
+
+.p2align 8
+.type minimal, at function
+minimal:
+ s_endpgm
+
+.p2align 8
+.type complete, at function
+complete:
+ s_endpgm
+
+.rodata
+// ASM: .rodata
+
+// Test that only specifying required directives is allowed, and that defaulted
+// values are omitted.
+.p2align 6
+.amdhsa_kernel minimal
+ .amdhsa_next_free_vgpr 0
+ .amdhsa_next_free_sgpr 0
+ .amdhsa_accum_offset 4
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel minimal
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_accum_offset 4
+// ASM: .amdhsa_tg_split 0
+// ASM: .end_amdhsa_kernel
+
+// Test that we can specify all available directives with non-default values.
+.p2align 6
+.amdhsa_kernel complete
+ .amdhsa_group_segment_fixed_size 1
+ .amdhsa_private_segment_fixed_size 1
+ .amdhsa_user_sgpr_private_segment_buffer 1
+ .amdhsa_user_sgpr_dispatch_ptr 1
+ .amdhsa_user_sgpr_queue_ptr 1
+ .amdhsa_user_sgpr_kernarg_segment_ptr 1
+ .amdhsa_user_sgpr_dispatch_id 1
+ .amdhsa_user_sgpr_flat_scratch_init 1
+ .amdhsa_kernarg_size 8
+ .amdhsa_user_sgpr_kernarg_preload_length 1
+ .amdhsa_user_sgpr_kernarg_preload_offset 1
+ .amdhsa_user_sgpr_private_segment_size 1
+ .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+ .amdhsa_system_sgpr_workgroup_id_x 0
+ .amdhsa_system_sgpr_workgroup_id_y 1
+ .amdhsa_system_sgpr_workgroup_id_z 1
+ .amdhsa_system_sgpr_workgroup_info 1
+ .amdhsa_system_vgpr_workitem_id 1
+ .amdhsa_next_free_vgpr 9
+ .amdhsa_next_free_sgpr 27
+ .amdhsa_accum_offset 4
+ .amdhsa_reserve_vcc 0
+ .amdhsa_reserve_flat_scratch 0
+ .amdhsa_float_round_mode_32 1
+ .amdhsa_float_round_mode_16_64 1
+ .amdhsa_float_denorm_mode_32 1
+ .amdhsa_float_denorm_mode_16_64 0
+ .amdhsa_dx10_clamp 0
+ .amdhsa_ieee_mode 0
+ .amdhsa_fp16_overflow 1
+ .amdhsa_tg_split 1
+ .amdhsa_exception_fp_ieee_invalid_op 1
+ .amdhsa_exception_fp_denorm_src 1
+ .amdhsa_exception_fp_ieee_div_zero 1
+ .amdhsa_exception_fp_ieee_overflow 1
+ .amdhsa_exception_fp_ieee_underflow 1
+ .amdhsa_exception_fp_ieee_inexact 1
+ .amdhsa_exception_int_div_zero 1
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel complete
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_kernarg_size 8
+// ASM-NEXT: .amdhsa_user_sgpr_count 16
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1
+// ASM-NEXT: .amdhsa_next_free_vgpr 9
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_accum_offset 4
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_reserve_flat_scratch 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 1
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_tg_split 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
+
+.section .foo
+
+.byte .amdgcn.gfx_generation_number
+// ASM: .byte 9
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v7, s10
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 8
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 11
+
+.set .amdgcn.next_free_vgpr, 0
+.set .amdgcn.next_free_sgpr, 0
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v16, s3
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 17
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 4
diff --git a/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s
new file mode 100644
index 000000000000000..9624515ecd6fb90
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s
@@ -0,0 +1,178 @@
+// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj < %s > %t
+// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// READOBJ: Section Headers
+// READOBJ: .text PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256
+// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} 000080 {{[0-9]+}} A {{[0-9]+}} {{[0-9]+}} 64
+
+// READOBJ: Relocation section '.rela.rodata' at offset
+// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10
+// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110
+
+// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries:
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000100 0 FUNC LOCAL PROTECTED 2 complete
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000040 64 OBJECT LOCAL DEFAULT 3 complete.kd
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000000 0 FUNC LOCAL PROTECTED 2 minimal
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000000 64 OBJECT LOCAL DEFAULT 3 minimal.kd
+
+// OBJDUMP: Contents of section .rodata
+// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here.
+// minimal
+// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000
+// complete
+// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100
+// OBJDUMP-NEXT: 0070 01510104 150f007f 5e008100 00000000
+
+.text
+// ASM: .text
+
+.amdgcn_target "amdgcn-amd-amdhsa--gfx940+xnack+sram-ecc"
+// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx940+xnack+sram-ecc"
+
+.p2align 8
+.type minimal, at function
+minimal:
+ s_endpgm
+
+.p2align 8
+.type complete, at function
+complete:
+ s_endpgm
+
+.rodata
+// ASM: .rodata
+
+// Test that only specifying required directives is allowed, and that defaulted
+// values are omitted.
+.p2align 6
+.amdhsa_kernel minimal
+ .amdhsa_next_free_vgpr 0
+ .amdhsa_next_free_sgpr 0
+ .amdhsa_accum_offset 4
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel minimal
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_accum_offset 4
+// ASM: .amdhsa_tg_split 0
+// ASM: .end_amdhsa_kernel
+
+// Test that we can specify all available directives with non-default values.
+.p2align 6
+.amdhsa_kernel complete
+ .amdhsa_group_segment_fixed_size 1
+ .amdhsa_private_segment_fixed_size 1
+ .amdhsa_user_sgpr_dispatch_ptr 1
+ .amdhsa_user_sgpr_queue_ptr 1
+ .amdhsa_user_sgpr_kernarg_segment_ptr 1
+ .amdhsa_user_sgpr_dispatch_id 1
+ .amdhsa_kernarg_size 8
+ .amdhsa_user_sgpr_kernarg_preload_length 1
+ .amdhsa_user_sgpr_kernarg_preload_offset 1
+ .amdhsa_user_sgpr_private_segment_size 1
+ .amdhsa_enable_private_segment 1
+ .amdhsa_system_sgpr_workgroup_id_x 0
+ .amdhsa_system_sgpr_workgroup_id_y 1
+ .amdhsa_system_sgpr_workgroup_id_z 1
+ .amdhsa_system_sgpr_workgroup_info 1
+ .amdhsa_system_vgpr_workitem_id 1
+ .amdhsa_next_free_vgpr 9
+ .amdhsa_next_free_sgpr 27
+ .amdhsa_accum_offset 4
+ .amdhsa_reserve_vcc 0
+ .amdhsa_float_round_mode_32 1
+ .amdhsa_float_round_mode_16_64 1
+ .amdhsa_float_denorm_mode_32 1
+ .amdhsa_float_denorm_mode_16_64 0
+ .amdhsa_dx10_clamp 0
+ .amdhsa_ieee_mode 0
+ .amdhsa_fp16_overflow 1
+ .amdhsa_tg_split 1
+ .amdhsa_exception_fp_ieee_invalid_op 1
+ .amdhsa_exception_fp_denorm_src 1
+ .amdhsa_exception_fp_ieee_div_zero 1
+ .amdhsa_exception_fp_ieee_overflow 1
+ .amdhsa_exception_fp_ieee_underflow 1
+ .amdhsa_exception_fp_ieee_inexact 1
+ .amdhsa_exception_int_div_zero 1
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel complete
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_kernarg_size 8
+// ASM-NEXT: .amdhsa_user_sgpr_count 10
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1
+// ASM-NEXT: .amdhsa_enable_private_segment 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1
+// ASM-NEXT: .amdhsa_next_free_vgpr 9
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_accum_offset 4
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 1
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_tg_split 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
+
+.section .foo
+
+.byte .amdgcn.gfx_generation_number
+// ASM: .byte 9
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v7, s10
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 8
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 11
+
+.set .amdgcn.next_free_vgpr, 0
+.set .amdgcn.next_free_sgpr, 0
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v16, s3
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 17
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 4
diff --git a/llvm/test/MC/AMDGPU/hsa-v3.s b/llvm/test/MC/AMDGPU/hsa-v3.s
new file mode 100644
index 000000000000000..9f854986d7bc447
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-v3.s
@@ -0,0 +1,304 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=3 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=3 -mattr=+xnack -filetype=obj < %s > %t
+// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// READOBJ: Section Headers
+// READOBJ: .text PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256
+// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} 000100 {{[0-9]+}} A {{[0-9]+}} {{[0-9]+}} 64
+
+// READOBJ: Relocation section '.rela.rodata' at offset
+// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10
+// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110
+// READOBJ: 0000000000000090 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 210
+// READOBJ: 00000000000000d0 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 310
+
+// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries:
+// READOBJ: 0000000000000000 0 FUNC LOCAL PROTECTED 2 minimal
+// READOBJ-NEXT: 0000000000000100 0 FUNC LOCAL PROTECTED 2 complete
+// READOBJ-NEXT: 0000000000000200 0 FUNC LOCAL PROTECTED 2 special_sgpr
+// READOBJ-NEXT: 0000000000000300 0 FUNC LOCAL PROTECTED 2 disabled_user_sgpr
+// READOBJ-NEXT: 0000000000000000 64 OBJECT LOCAL DEFAULT 3 minimal.kd
+// READOBJ-NEXT: 0000000000000040 64 OBJECT LOCAL DEFAULT 3 complete.kd
+// READOBJ-NEXT: 0000000000000080 64 OBJECT LOCAL DEFAULT 3 special_sgpr.kd
+// READOBJ-NEXT: 00000000000000c0 64 OBJECT LOCAL DEFAULT 3 disabled_user_sgpr.kd
+
+// OBJDUMP: Contents of section .rodata
+// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here.
+// minimal
+// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000
+// complete
+// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0070 c2500104 1f0f007f 7f000000 00000000
+// special_sgpr
+// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00b0 00010000 80000000 00000000 00000000
+// disabled_user_sgpr
+// OBJDUMP-NEXT: 00c0 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00f0 0000ac00 80000000 00000000 00000000
+
+.text
+// ASM: .text
+
+.amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack"
+// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack"
+
+.p2align 8
+.type minimal, at function
+minimal:
+ s_endpgm
+
+.p2align 8
+.type complete, at function
+complete:
+ s_endpgm
+
+.p2align 8
+.type special_sgpr, at function
+special_sgpr:
+ s_endpgm
+
+.p2align 8
+.type disabled_user_sgpr, at function
+disabled_user_sgpr:
+ s_endpgm
+
+.rodata
+// ASM: .rodata
+
+// Test that only specifying required directives is allowed, and that defaulted
+// values are omitted.
+.p2align 6
+.amdhsa_kernel minimal
+ .amdhsa_next_free_vgpr 0
+ .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel minimal
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM: .end_amdhsa_kernel
+
+// Test that we can specify all available directives with non-default values.
+.p2align 6
+.amdhsa_kernel complete
+ .amdhsa_group_segment_fixed_size 1
+ .amdhsa_private_segment_fixed_size 1
+ .amdhsa_kernarg_size 8
+ .amdhsa_user_sgpr_private_segment_buffer 1
+ .amdhsa_user_sgpr_dispatch_ptr 1
+ .amdhsa_user_sgpr_queue_ptr 1
+ .amdhsa_user_sgpr_kernarg_segment_ptr 1
+ .amdhsa_user_sgpr_dispatch_id 1
+ .amdhsa_user_sgpr_flat_scratch_init 1
+ .amdhsa_user_sgpr_private_segment_size 1
+ .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+ .amdhsa_system_sgpr_workgroup_id_x 0
+ .amdhsa_system_sgpr_workgroup_id_y 1
+ .amdhsa_system_sgpr_workgroup_id_z 1
+ .amdhsa_system_sgpr_workgroup_info 1
+ .amdhsa_system_vgpr_workitem_id 1
+ .amdhsa_next_free_vgpr 9
+ .amdhsa_next_free_sgpr 27
+ .amdhsa_reserve_vcc 0
+ .amdhsa_reserve_flat_scratch 0
+ .amdhsa_reserve_xnack_mask 1
+ .amdhsa_float_round_mode_32 1
+ .amdhsa_float_round_mode_16_64 1
+ .amdhsa_float_denorm_mode_32 1
+ .amdhsa_float_denorm_mode_16_64 0
+ .amdhsa_dx10_clamp 0
+ .amdhsa_ieee_mode 0
+ .amdhsa_fp16_overflow 1
+ .amdhsa_exception_fp_ieee_invalid_op 1
+ .amdhsa_exception_fp_denorm_src 1
+ .amdhsa_exception_fp_ieee_div_zero 1
+ .amdhsa_exception_fp_ieee_overflow 1
+ .amdhsa_exception_fp_ieee_underflow 1
+ .amdhsa_exception_fp_ieee_inexact 1
+ .amdhsa_exception_int_div_zero 1
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel complete
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_kernarg_size 8
+// ASM-NEXT: .amdhsa_user_sgpr_count 15
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1
+// ASM-NEXT: .amdhsa_next_free_vgpr 9
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_reserve_flat_scratch 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 1
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
+
+// Test that we are including special SGPR usage in the granulated count.
+.p2align 6
+.amdhsa_kernel special_sgpr
+ // Same next_free_sgpr as "complete", but...
+ .amdhsa_next_free_sgpr 27
+ // ...on GFX9 this should require an additional 6 SGPRs, pushing us from
+ // 3 granules to 4
+ .amdhsa_reserve_flat_scratch 1
+
+ .amdhsa_reserve_vcc 0
+ .amdhsa_reserve_xnack_mask 1
+
+ .amdhsa_float_denorm_mode_16_64 0
+ .amdhsa_dx10_clamp 0
+ .amdhsa_ieee_mode 0
+ .amdhsa_next_free_vgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel special_sgpr
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM: .end_amdhsa_kernel
+
+// Test that explicitly disabling user_sgpr's does not affect the user_sgpr
+// count, i.e. this should produce the same descriptor as minimal.
+.p2align 6
+.amdhsa_kernel disabled_user_sgpr
+ .amdhsa_user_sgpr_private_segment_buffer 0
+ .amdhsa_next_free_vgpr 0
+ .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel disabled_user_sgpr
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM: .end_amdhsa_kernel
+
+.section .foo
+
+.byte .amdgcn.gfx_generation_number
+// ASM: .byte 9
+
+.byte .amdgcn.gfx_generation_minor
+// ASM: .byte 0
+
+.byte .amdgcn.gfx_generation_stepping
+// ASM: .byte 4
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v7, s10
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 8
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 11
+
+.set .amdgcn.next_free_vgpr, 0
+.set .amdgcn.next_free_sgpr, 0
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v16, s3
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 17
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 4
+
+// Metadata
+
+.amdgpu_metadata
+ amdhsa.version:
+ - 3
+ - 0
+ amdhsa.kernels:
+ - .name: amd_kernel_code_t_test_all
+ .symbol: amd_kernel_code_t_test_all at kd
+ .kernarg_segment_size: 8
+ .group_segment_fixed_size: 16
+ .private_segment_fixed_size: 32
+ .kernarg_segment_align: 64
+ .wavefront_size: 128
+ .sgpr_count: 14
+ .vgpr_count: 40
+ .max_flat_workgroup_size: 256
+ - .name: amd_kernel_code_t_minimal
+ .symbol: amd_kernel_code_t_minimal at kd
+ .kernarg_segment_size: 8
+ .group_segment_fixed_size: 16
+ .private_segment_fixed_size: 32
+ .kernarg_segment_align: 64
+ .wavefront_size: 128
+ .sgpr_count: 14
+ .vgpr_count: 40
+ .max_flat_workgroup_size: 256
+.end_amdgpu_metadata
+
+// ASM: .amdgpu_metadata
+// ASM: amdhsa.kernels:
+// ASM: - .group_segment_fixed_size: 16
+// ASM: .kernarg_segment_align: 64
+// ASM: .kernarg_segment_size: 8
+// ASM: .max_flat_workgroup_size: 256
+// ASM: .name: amd_kernel_code_t_test_all
+// ASM: .private_segment_fixed_size: 32
+// ASM: .sgpr_count: 14
+// ASM: .symbol: 'amd_kernel_code_t_test_all at kd'
+// ASM: .vgpr_count: 40
+// ASM: .wavefront_size: 128
+// ASM: - .group_segment_fixed_size: 16
+// ASM: .kernarg_segment_align: 64
+// ASM: .kernarg_segment_size: 8
+// ASM: .max_flat_workgroup_size: 256
+// ASM: .name: amd_kernel_code_t_minimal
+// ASM: .private_segment_fixed_size: 32
+// ASM: .sgpr_count: 14
+// ASM: .symbol: 'amd_kernel_code_t_minimal at kd'
+// ASM: .vgpr_count: 40
+// ASM: .wavefront_size: 128
+// ASM: amdhsa.version:
+// ASM-NEXT: - 3
+// ASM-NEXT: - 0
+// ASM: .end_amdgpu_metadata
diff --git a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s
index 7e3ae8424cc7bdc..63e532e0ffa3768 100644
--- a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s
+++ b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx90a %s 2>&1 >/dev/null | FileCheck -check-prefix=ERR %s
+// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a %s 2>&1 >/dev/null | FileCheck -check-prefix=ERR %s
.amdhsa_kernel implied_count_too_low_0
.amdhsa_user_sgpr_count 0
diff --git a/llvm/test/MC/AMDGPU/user-sgpr-count.s b/llvm/test/MC/AMDGPU/user-sgpr-count.s
index 950c514f786b254..aa8970185eb04bc 100644
--- a/llvm/test/MC/AMDGPU/user-sgpr-count.s
+++ b/llvm/test/MC/AMDGPU/user-sgpr-count.s
@@ -1,10 +1,10 @@
-// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a --amdhsa-code-object-version=4 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a --amdhsa-code-object-version=3 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
.text
// ASM: .text
-.amdgcn_target "amdgcn-amd-amdhsa--gfx90a:xnack+"
-// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a:xnack+"
+.amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc"
+// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc"
// ASM-LABEL: .amdhsa_kernel user_sgprs_implied_count
More information about the llvm-commits
mailing list