[lld] 868abf0 - Revert "[AMDGPU] Remove Code Object V3 (#67118)"

via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 18 03:57:42 PDT 2023


Author: pvanhout
Date: 2023-10-18T12:55:36+02:00
New Revision: 868abf09619cb10ba710162483e5a66b0c1e4446

URL: https://github.com/llvm/llvm-project/commit/868abf09619cb10ba710162483e5a66b0c1e4446
DIFF: https://github.com/llvm/llvm-project/commit/868abf09619cb10ba710162483e5a66b0c1e4446.diff

LOG: Revert "[AMDGPU] Remove Code Object V3 (#67118)"

This reverts commit 544d91280c26fd5f7acd70eac4d667863562f4cc.

Added: 
    llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll
    llvm/test/CodeGen/AMDGPU/directive-amdgcn-target-v3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3-asan.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
    llvm/test/MC/AMDGPU/hsa-diag-v3.s
    llvm/test/MC/AMDGPU/hsa-gfx10-v3.s
    llvm/test/MC/AMDGPU/hsa-gfx11-v3.s
    llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s
    llvm/test/MC/AMDGPU/hsa-gfx940-v3.s
    llvm/test/MC/AMDGPU/hsa-v3.s

Modified: 
    clang/include/clang/Basic/TargetOptions.h
    clang/include/clang/Driver/Options.td
    clang/lib/Driver/ToolChains/CommonArgs.cpp
    clang/test/CodeGenCUDA/amdgpu-code-object-version.cu
    clang/test/Driver/hip-code-object-version.hip
    clang/test/Driver/hip-device-libs.hip
    lld/test/ELF/amdgpu-abi-version.s
    llvm/docs/AMDGPUUsage.rst
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
    llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
    llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
    llvm/test/CodeGen/AMDGPU/kernarg-size.ll
    llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
    llvm/test/CodeGen/AMDGPU/trap-abis.ll
    llvm/test/MC/AMDGPU/user-sgpr-count-diag.s
    llvm/test/MC/AMDGPU/user-sgpr-count.s

Removed: 
    llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v4.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-asan.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
    llvm/test/MC/AMDGPU/hsa-diag-v4.s


################################################################################
diff  --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h
index ba3acd029587160..8bb03249b7f8308 100644
--- a/clang/include/clang/Basic/TargetOptions.h
+++ b/clang/include/clang/Basic/TargetOptions.h
@@ -83,7 +83,7 @@ class TargetOptions {
   enum CodeObjectVersionKind {
     COV_None,
     COV_2 = 200, // Unsupported.
-    COV_3 = 300, // Unsupported.
+    COV_3 = 300,
     COV_4 = 400,
     COV_5 = 500,
   };

diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index a89d6b6579f1176..640044622fc09ee 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4682,9 +4682,9 @@ defm amdgpu_ieee : BoolOption<"m", "amdgpu-ieee",
 def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group<m_Group>,
   HelpText<"Specify code object ABI version. Defaults to 4. (AMDGPU only)">,
   Visibility<[ClangOption, CC1Option]>,
-  Values<"none,4,5">,
+  Values<"none,3,4,5">,
   NormalizedValuesScope<"TargetOptions">,
-  NormalizedValues<["COV_None", "COV_4", "COV_5"]>,
+  NormalizedValues<["COV_None", "COV_3", "COV_4", "COV_5"]>,
   MarshallingInfoEnum<TargetOpts<"CodeObjectVersion">, "COV_4">;
 
 defm cumode : SimpleMFlag<"cumode",

diff  --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index f104ec5a881cb96..25fd940584624ee 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -2338,7 +2338,7 @@ getAMDGPUCodeObjectArgument(const Driver &D, const llvm::opt::ArgList &Args) {
 
 void tools::checkAMDGPUCodeObjectVersion(const Driver &D,
                                          const llvm::opt::ArgList &Args) {
-  const unsigned MinCodeObjVer = 4;
+  const unsigned MinCodeObjVer = 3;
   const unsigned MaxCodeObjVer = 5;
 
   if (auto *CodeObjArg = getAMDGPUCodeObjectArgument(D, Args)) {

diff  --git a/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu b/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu
index ff5deaf9ab850d2..0ddd63faf46f28f 100644
--- a/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu
@@ -3,6 +3,9 @@
 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
 // RUN:   -o - %s | FileCheck %s -check-prefix=V4
 
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
+// RUN:   -mcode-object-version=3 -o - %s | FileCheck -check-prefix=V3 %s
+
 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
 // RUN:   -mcode-object-version=4 -o - %s | FileCheck -check-prefix=V4 %s
 
@@ -15,6 +18,7 @@
 // RUN: not %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
 // RUN:   -mcode-object-version=4.1 -o - %s 2>&1| FileCheck %s -check-prefix=INV
 
+// V3: !{{.*}} = !{i32 1, !"amdgpu_code_object_version", i32 300}
 // V4: !{{.*}} = !{i32 1, !"amdgpu_code_object_version", i32 400}
 // V5: !{{.*}} = !{i32 1, !"amdgpu_code_object_version", i32 500}
 // NONE-NOT: !{{.*}} = !{i32 1, !"amdgpu_code_object_version",

diff  --git a/clang/test/Driver/hip-code-object-version.hip b/clang/test/Driver/hip-code-object-version.hip
index af5f9a3da21dfd3..33559b6576e7d30 100644
--- a/clang/test/Driver/hip-code-object-version.hip
+++ b/clang/test/Driver/hip-code-object-version.hip
@@ -1,5 +1,20 @@
 // REQUIRES: amdgpu-registered-target
 
+// Check bundle ID for code object v3.
+
+// RUN: not %clang -### --target=x86_64-linux-gnu \
+// RUN:   -mcode-object-version=3 \
+// RUN:   --offload-arch=gfx906 --rocm-path=%S/Inputs/rocm \
+// RUN:   %s 2>&1 | FileCheck -check-prefix=V3 %s
+
+// RUN: not %clang -### --target=x86_64-linux-gnu \
+// RUN:   -mcode-object-version=4 -mcode-object-version=3 \
+// RUN:   --offload-arch=gfx906 --rocm-path=%S/Inputs/rocm \
+// RUN:   %s 2>&1 | FileCheck -check-prefix=V3 %s
+
+// V3: "-mcode-object-version=3"
+// V3: "-mllvm" "--amdhsa-code-object-version=3"
+// V3: "-targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa--gfx906"
 
 // Check bundle ID for code object version 4.
 
@@ -47,13 +62,6 @@
 // INVALID_2: error: invalid integral value '2' in '-mcode-object-version=2'
 // INVALID_2-NOT: error: invalid integral value
 
-// RUN: not %clang -### --target=x86_64-linux-gnu \
-// RUN:   -mcode-object-version=3 \
-// RUN:   --offload-arch=gfx906 --rocm-path=%S/Inputs/rocm \
-// RUN:   %s 2>&1 | FileCheck -check-prefix=INVALID_3 %s
-// INVALID_3: error: invalid integral value '3' in '-mcode-object-version=3'
-// INVALID_3-NOT: error: invalid integral value
-
 // Check LLVM code object version option --amdhsa-code-object-version
 // is passed to -cc1 and -cc1as, and -mcode-object-version is passed
 // to -cc1 but not -cc1as.

diff  --git a/clang/test/Driver/hip-device-libs.hip b/clang/test/Driver/hip-device-libs.hip
index 6ac5778721ba5b7..71d9554da696b42 100644
--- a/clang/test/Driver/hip-device-libs.hip
+++ b/clang/test/Driver/hip-device-libs.hip
@@ -168,6 +168,12 @@
 // RUN:   --rocm-path=%S/Inputs/rocm %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=NOABI4
 
+// Test -mcode-object-version=3
+// RUN: %clang -### --target=x86_64-linux-gnu --offload-arch=gfx900 \
+// RUN:   -mcode-object-version=3 \
+// RUN:   --rocm-path=%S/Inputs/rocm %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck %s --check-prefixes=ABI4
+
 // Test -mcode-object-version=4
 // RUN: %clang -### --target=x86_64-linux-gnu --offload-arch=gfx900 \
 // RUN:   -mcode-object-version=4 \

diff  --git a/lld/test/ELF/amdgpu-abi-version.s b/lld/test/ELF/amdgpu-abi-version.s
index cda9f5aafa5ee2f..455a52aec921092 100644
--- a/lld/test/ELF/amdgpu-abi-version.s
+++ b/lld/test/ELF/amdgpu-abi-version.s
@@ -1,4 +1,11 @@
 # REQUIRES: amdgpu
+# RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 -filetype=obj %s -o %t.o
+# RUN: ld.lld -shared %t.o -o %t.so
+# RUN: llvm-readobj --file-headers %t.so | FileCheck --check-prefix=COV3 %s
+
+# COV3: OS/ABI: AMDGPU_HSA (0x40)
+# COV3: ABIVersion: 1
+
 # RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 -filetype=obj %s -o %t.o
 # RUN: ld.lld -shared %t.o -o %t.so
 # RUN: llvm-readobj --file-headers %t.so | FileCheck --check-prefix=COV4 %s

diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index ed9581ccc93dfac..8022816d7e616d3 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1409,10 +1409,12 @@ The AMDGPU backend uses the following ELF header:
   object conforms:
 
   * ``ELFABIVERSION_AMDGPU_HSA_V2`` is used to specify the version of AMD HSA
-    runtime ABI for code object V2. Can no longer be emitted by this version of LLVM.
+    runtime ABI for code object V2. Specify using the Clang option
+    ``-mcode-object-version=2``.
 
   * ``ELFABIVERSION_AMDGPU_HSA_V3`` is used to specify the version of AMD HSA
-    runtime ABI for code object V3. Can no longer be emitted by this version of LLVM.
+    runtime ABI for code object V3. Specify using the Clang option
+    ``-mcode-object-version=3``.
 
   * ``ELFABIVERSION_AMDGPU_HSA_V4`` is used to specify the version of AMD HSA
     runtime ABI for code object V4. Specify using the Clang option
@@ -3400,7 +3402,8 @@ Code Object V3 Metadata
 +++++++++++++++++++++++
 
 .. warning::
-  Code object V3 generation is no longer supported by this version of LLVM.
+  Code object V3 is not the default code object version emitted by this version
+  of LLVM.
 
 Code object V3 and above metadata is specified by the ``NT_AMDGPU_METADATA`` note
 record (see :ref:`amdgpu-note-records-v3-onwards`).

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 8d0ef67a615dfc6..aadc4a68ea13278 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -341,6 +341,9 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) {
 
   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
     switch (CodeObjectVersion) {
+    case AMDGPU::AMDHSA_COV3:
+      HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3());
+      break;
     case AMDGPU::AMDHSA_COV4:
       HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
       break;

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index b51a876750b58b0..5060cd3aec581ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -49,14 +49,14 @@ namespace AMDGPU {
 namespace HSAMD {
 
 //===----------------------------------------------------------------------===//
-// HSAMetadataStreamerV4
+// HSAMetadataStreamerV3
 //===----------------------------------------------------------------------===//
 
-void MetadataStreamerMsgPackV4::dump(StringRef HSAMetadataString) const {
+void MetadataStreamerMsgPackV3::dump(StringRef HSAMetadataString) const {
   errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
 }
 
-void MetadataStreamerMsgPackV4::verify(StringRef HSAMetadataString) const {
+void MetadataStreamerMsgPackV3::verify(StringRef HSAMetadataString) const {
   errs() << "AMDGPU HSA Metadata Parser Test: ";
 
   msgpack::Document FromHSAMetadataString;
@@ -78,7 +78,7 @@ void MetadataStreamerMsgPackV4::verify(StringRef HSAMetadataString) const {
 }
 
 std::optional<StringRef>
-MetadataStreamerMsgPackV4::getAccessQualifier(StringRef AccQual) const {
+MetadataStreamerMsgPackV3::getAccessQualifier(StringRef AccQual) const {
   return StringSwitch<std::optional<StringRef>>(AccQual)
       .Case("read_only", StringRef("read_only"))
       .Case("write_only", StringRef("write_only"))
@@ -86,7 +86,7 @@ MetadataStreamerMsgPackV4::getAccessQualifier(StringRef AccQual) const {
       .Default(std::nullopt);
 }
 
-std::optional<StringRef> MetadataStreamerMsgPackV4::getAddressSpaceQualifier(
+std::optional<StringRef> MetadataStreamerMsgPackV3::getAddressSpaceQualifier(
     unsigned AddressSpace) const {
   switch (AddressSpace) {
   case AMDGPUAS::PRIVATE_ADDRESS:
@@ -107,7 +107,7 @@ std::optional<StringRef> MetadataStreamerMsgPackV4::getAddressSpaceQualifier(
 }
 
 StringRef
-MetadataStreamerMsgPackV4::getValueKind(Type *Ty, StringRef TypeQual,
+MetadataStreamerMsgPackV3::getValueKind(Type *Ty, StringRef TypeQual,
                                         StringRef BaseTypeName) const {
   if (TypeQual.contains("pipe"))
     return "pipe";
@@ -134,7 +134,7 @@ MetadataStreamerMsgPackV4::getValueKind(Type *Ty, StringRef TypeQual,
                    : "by_value");
 }
 
-std::string MetadataStreamerMsgPackV4::getTypeName(Type *Ty,
+std::string MetadataStreamerMsgPackV3::getTypeName(Type *Ty,
                                                    bool Signed) const {
   switch (Ty->getTypeID()) {
   case Type::IntegerTyID: {
@@ -173,7 +173,7 @@ std::string MetadataStreamerMsgPackV4::getTypeName(Type *Ty,
 }
 
 msgpack::ArrayDocNode
-MetadataStreamerMsgPackV4::getWorkGroupDimensions(MDNode *Node) const {
+MetadataStreamerMsgPackV3::getWorkGroupDimensions(MDNode *Node) const {
   auto Dims = HSAMetadataDoc->getArrayNode();
   if (Node->getNumOperands() != 3)
     return Dims;
@@ -184,20 +184,14 @@ MetadataStreamerMsgPackV4::getWorkGroupDimensions(MDNode *Node) const {
   return Dims;
 }
 
-void MetadataStreamerMsgPackV4::emitVersion() {
+void MetadataStreamerMsgPackV3::emitVersion() {
   auto Version = HSAMetadataDoc->getArrayNode();
-  Version.push_back(Version.getDocument()->getNode(VersionMajorV4));
-  Version.push_back(Version.getDocument()->getNode(VersionMinorV4));
+  Version.push_back(Version.getDocument()->getNode(VersionMajorV3));
+  Version.push_back(Version.getDocument()->getNode(VersionMinorV3));
   getRootMetadata("amdhsa.version") = Version;
 }
 
-void MetadataStreamerMsgPackV4::emitTargetID(
-    const IsaInfo::AMDGPUTargetID &TargetID) {
-  getRootMetadata("amdhsa.target") =
-      HSAMetadataDoc->getNode(TargetID.toString(), /*Copy=*/true);
-}
-
-void MetadataStreamerMsgPackV4::emitPrintf(const Module &Mod) {
+void MetadataStreamerMsgPackV3::emitPrintf(const Module &Mod) {
   auto Node = Mod.getNamedMetadata("llvm.printf.fmts");
   if (!Node)
     return;
@@ -210,7 +204,7 @@ void MetadataStreamerMsgPackV4::emitPrintf(const Module &Mod) {
   getRootMetadata("amdhsa.printf") = Printf;
 }
 
-void MetadataStreamerMsgPackV4::emitKernelLanguage(const Function &Func,
+void MetadataStreamerMsgPackV3::emitKernelLanguage(const Function &Func,
                                                    msgpack::MapDocNode Kern) {
   // TODO: What about other languages?
   auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
@@ -229,7 +223,7 @@ void MetadataStreamerMsgPackV4::emitKernelLanguage(const Function &Func,
   Kern[".language_version"] = LanguageVersion;
 }
 
-void MetadataStreamerMsgPackV4::emitKernelAttrs(const Function &Func,
+void MetadataStreamerMsgPackV3::emitKernelAttrs(const Function &Func,
                                                 msgpack::MapDocNode Kern) {
 
   if (auto Node = Func.getMetadata("reqd_work_group_size"))
@@ -254,7 +248,7 @@ void MetadataStreamerMsgPackV4::emitKernelAttrs(const Function &Func,
     Kern[".kind"] = Kern.getDocument()->getNode("fini");
 }
 
-void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
+void MetadataStreamerMsgPackV3::emitKernelArgs(const MachineFunction &MF,
                                                msgpack::MapDocNode Kern) {
   auto &Func = MF.getFunction();
   unsigned Offset = 0;
@@ -267,7 +261,7 @@ void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
   Kern[".args"] = Args;
 }
 
-void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg,
+void MetadataStreamerMsgPackV3::emitKernelArg(const Argument &Arg,
                                               unsigned &Offset,
                                               msgpack::ArrayDocNode Args) {
   auto Func = Arg.getParent();
@@ -332,7 +326,7 @@ void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg,
                 AccQual, TypeQual);
 }
 
-void MetadataStreamerMsgPackV4::emitKernelArg(
+void MetadataStreamerMsgPackV3::emitKernelArg(
     const DataLayout &DL, Type *Ty, Align Alignment, StringRef ValueKind,
     unsigned &Offset, msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign,
     StringRef Name, StringRef TypeName, StringRef BaseTypeName,
@@ -381,7 +375,7 @@ void MetadataStreamerMsgPackV4::emitKernelArg(
   Args.push_back(Arg);
 }
 
-void MetadataStreamerMsgPackV4::emitHiddenKernelArgs(
+void MetadataStreamerMsgPackV3::emitHiddenKernelArgs(
     const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) {
   auto &Func = MF.getFunction();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -454,10 +448,9 @@ void MetadataStreamerMsgPackV4::emitHiddenKernelArgs(
   }
 }
 
-msgpack::MapDocNode
-MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
-                                             const SIProgramInfo &ProgramInfo,
-                                             unsigned CodeObjectVersion) const {
+msgpack::MapDocNode MetadataStreamerMsgPackV3::getHSAKernelProps(
+    const MachineFunction &MF, const SIProgramInfo &ProgramInfo,
+    unsigned CodeObjectVersion) const {
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
   const Function &F = MF.getFunction();
@@ -502,19 +495,18 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
   return Kern;
 }
 
-bool MetadataStreamerMsgPackV4::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
+bool MetadataStreamerMsgPackV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
   return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true);
 }
 
-void MetadataStreamerMsgPackV4::begin(const Module &Mod,
+void MetadataStreamerMsgPackV3::begin(const Module &Mod,
                                       const IsaInfo::AMDGPUTargetID &TargetID) {
   emitVersion();
-  emitTargetID(TargetID);
   emitPrintf(Mod);
   getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
 }
 
-void MetadataStreamerMsgPackV4::end() {
+void MetadataStreamerMsgPackV3::end() {
   std::string HSAMetadataString;
   raw_string_ostream StrOS(HSAMetadataString);
   HSAMetadataDoc->toYAML(StrOS);
@@ -525,7 +517,7 @@ void MetadataStreamerMsgPackV4::end() {
     verify(StrOS.str());
 }
 
-void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF,
+void MetadataStreamerMsgPackV3::emitKernel(const MachineFunction &MF,
                                            const SIProgramInfo &ProgramInfo) {
   auto &Func = MF.getFunction();
   if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
@@ -550,6 +542,31 @@ void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF,
   Kernels.push_back(Kern);
 }
 
+//===----------------------------------------------------------------------===//
+// HSAMetadataStreamerV4
+//===----------------------------------------------------------------------===//
+
+void MetadataStreamerMsgPackV4::emitVersion() {
+  auto Version = HSAMetadataDoc->getArrayNode();
+  Version.push_back(Version.getDocument()->getNode(VersionMajorV4));
+  Version.push_back(Version.getDocument()->getNode(VersionMinorV4));
+  getRootMetadata("amdhsa.version") = Version;
+}
+
+void MetadataStreamerMsgPackV4::emitTargetID(
+    const IsaInfo::AMDGPUTargetID &TargetID) {
+  getRootMetadata("amdhsa.target") =
+      HSAMetadataDoc->getNode(TargetID.toString(), /*Copy=*/true);
+}
+
+void MetadataStreamerMsgPackV4::begin(const Module &Mod,
+                                      const IsaInfo::AMDGPUTargetID &TargetID) {
+  emitVersion();
+  emitTargetID(TargetID);
+  emitPrintf(Mod);
+  getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
+}
+
 //===----------------------------------------------------------------------===//
 // HSAMetadataStreamerV5
 //===----------------------------------------------------------------------===//
@@ -663,7 +680,7 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
 
 void MetadataStreamerMsgPackV5::emitKernelAttrs(const Function &Func,
                                                 msgpack::MapDocNode Kern) {
-  MetadataStreamerMsgPackV4::emitKernelAttrs(Func, Kern);
+  MetadataStreamerMsgPackV3::emitKernelAttrs(Func, Kern);
 
   if (Func.getFnAttribute("uniform-work-group-size").getValueAsBool())
     Kern[".uniform_work_group_size"] = Kern.getDocument()->getNode(1);

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 18a7b5d7a9633e8..d2b3b8917ce0f70 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -62,7 +62,7 @@ class MetadataStreamer {
                                msgpack::MapDocNode Kern) = 0;
 };
 
-class MetadataStreamerMsgPackV4 : public MetadataStreamer {
+class MetadataStreamerMsgPackV3 : public MetadataStreamer {
 protected:
   std::unique_ptr<msgpack::Document> HSAMetadataDoc =
       std::make_unique<msgpack::Document>();
@@ -89,8 +89,6 @@ class MetadataStreamerMsgPackV4 : public MetadataStreamer {
 
   void emitVersion() override;
 
-  void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID);
-
   void emitPrintf(const Module &Mod);
 
   void emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern);
@@ -122,8 +120,8 @@ class MetadataStreamerMsgPackV4 : public MetadataStreamer {
   }
 
 public:
-  MetadataStreamerMsgPackV4() = default;
-  ~MetadataStreamerMsgPackV4() = default;
+  MetadataStreamerMsgPackV3() = default;
+  ~MetadataStreamerMsgPackV3() = default;
 
   bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
 
@@ -136,6 +134,19 @@ class MetadataStreamerMsgPackV4 : public MetadataStreamer {
                   const SIProgramInfo &ProgramInfo) override;
 };
 
+class MetadataStreamerMsgPackV4 : public MetadataStreamerMsgPackV3 {
+protected:
+  void emitVersion() override;
+  void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID);
+
+public:
+  MetadataStreamerMsgPackV4() = default;
+  ~MetadataStreamerMsgPackV4() = default;
+
+  void begin(const Module &Mod,
+             const IsaInfo::AMDGPUTargetID &TargetID) override;
+};
+
 class MetadataStreamerMsgPackV5 final : public MetadataStreamerMsgPackV4 {
 protected:
   void emitVersion() override;

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 21abfb42d11ba21..3d70ed150df12f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6478,6 +6478,11 @@ bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
     return legalizeTrapEndpgm(MI, MRI, B);
 
+  const Module *M = B.getMF().getFunction().getParent();
+  unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
+  if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
+    return legalizeTrapHsaQueuePtr(MI, MRI, B);
+
   return ST.supportsGetDoorbellID() ?
          legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
 }

diff  --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 42af09e27e471e8..6b8c03c1620d26b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -424,6 +424,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
   switch (CodeObjectVersion) {
   default:
     break;
+  case AMDGPU::AMDHSA_COV3:
   case AMDGPU::AMDHSA_COV4:
   case AMDGPU::AMDHSA_COV5:
     if (getTargetID()->isXnackSupported())

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9c5b166c9652238..f8f1e6d6c9097cc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5992,6 +5992,11 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
       Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
     return lowerTrapEndpgm(Op, DAG);
 
+  const Module *M = DAG.getMachineFunction().getFunction().getParent();
+  unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
+  if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
+    return lowerTrapHsaQueuePtr(Op, DAG);
+
   return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
          lowerTrapHsaQueuePtr(Op, DAG);
 }

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5fff19eada75dd8..d123b384a27d4cc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -128,6 +128,8 @@ std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
     return std::nullopt;
 
   switch (AmdhsaCodeObjectVersion) {
+  case 3:
+    return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
   case 4:
     return ELF::ELFABIVERSION_AMDGPU_HSA_V4;
   case 5:
@@ -138,6 +140,12 @@ std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
   }
 }
 
+bool isHsaAbiVersion3(const MCSubtargetInfo *STI) {
+  if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
+    return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V3;
+  return false;
+}
+
 bool isHsaAbiVersion4(const MCSubtargetInfo *STI) {
   if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
     return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V4;
@@ -166,6 +174,7 @@ unsigned getCodeObjectVersion(const Module &M) {
 
 unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) {
   switch (CodeObjectVersion) {
+  case AMDHSA_COV3:
   case AMDHSA_COV4:
     return 48;
   case AMDHSA_COV5:
@@ -179,6 +188,7 @@ unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) {
 // central TD file.
 unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion) {
   switch (CodeObjectVersion) {
+  case AMDHSA_COV3:
   case AMDHSA_COV4:
     return 24;
   case AMDHSA_COV5:
@@ -189,6 +199,7 @@ unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion) {
 
 unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion) {
   switch (CodeObjectVersion) {
+  case AMDHSA_COV3:
   case AMDHSA_COV4:
     return 32;
   case AMDHSA_COV5:
@@ -199,6 +210,7 @@ unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion) {
 
 unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) {
   switch (CodeObjectVersion) {
+  case AMDHSA_COV3:
   case AMDHSA_COV4:
     return 40;
   case AMDHSA_COV5:
@@ -762,6 +774,15 @@ std::string AMDGPUTargetID::toString() const {
   std::string Features;
   if (STI.getTargetTriple().getOS() == Triple::AMDHSA) {
     switch (CodeObjectVersion) {
+    case AMDGPU::AMDHSA_COV3:
+      // xnack.
+      if (isXnackOnOrAny())
+        Features += "+xnack";
+      // In code object v2 and v3, "sramecc" feature was spelled with a
+      // hyphen ("sram-ecc").
+      if (isSramEccOnOrAny())
+        Features += "+sram-ecc";
+      break;
     case AMDGPU::AMDHSA_COV4:
     case AMDGPU::AMDHSA_COV5:
       // sramecc.

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 1e0994d0862cf5d..bb2964f592f66bf 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -42,12 +42,19 @@ namespace AMDGPU {
 
 struct IsaVersion;
 
-enum { AMDHSA_COV4 = 4, AMDHSA_COV5 = 5 };
+enum {
+  AMDHSA_COV3 = 3,
+  AMDHSA_COV4 = 4,
+  AMDHSA_COV5 = 5
+};
 
 /// \returns True if \p STI is AMDHSA.
 bool isHsaAbi(const MCSubtargetInfo &STI);
 /// \returns HSA OS ABI Version identification.
 std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI);
+/// \returns True if HSA OS ABI Version identification is 3,
+/// false otherwise.
+bool isHsaAbiVersion3(const MCSubtargetInfo *STI);
 /// \returns True if HSA OS ABI Version identification is 4,
 /// false otherwise.
 bool isHsaAbiVersion4(const MCSubtargetInfo *STI);

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 4bdbe6604782a8b..c25ecafa1f7c074 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -1,11 +1,38 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V3 %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V4 %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V5 %s
 
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V3 %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V4 %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s
 
 define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
+; GFX8V3-LABEL: addrspacecast:
+; GFX8V3:       ; %bb.0:
+; GFX8V3-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x40
+; GFX8V3-NEXT:    v_mov_b32_e32 v2, 1
+; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT:    s_mov_b32 s4, s0
+; GFX8V3-NEXT:    s_mov_b32 s5, s3
+; GFX8V3-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX8V3-NEXT:    s_cselect_b64 s[4:5], s[4:5], 0
+; GFX8V3-NEXT:    s_mov_b32 s6, s1
+; GFX8V3-NEXT:    s_mov_b32 s7, s2
+; GFX8V3-NEXT:    s_cmp_lg_u32 s1, -1
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8V3-NEXT:    s_cselect_b64 s[0:1], s[6:7], 0
+; GFX8V3-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8V3-NEXT:    flat_store_dword v[0:1], v2
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT:    v_mov_b32_e32 v2, 2
+; GFX8V3-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8V3-NEXT:    flat_store_dword v[0:1], v2
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    s_endpgm
+;
 ; GFX8V4-LABEL: addrspacecast:
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -55,6 +82,30 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V5-NEXT:    s_endpgm
 ;
+; GFX9V3-LABEL: addrspacecast:
+; GFX9V3:       ; %bb.0:
+; GFX9V3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V3-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX9V3-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX9V3-NEXT:    v_mov_b32_e32 v2, 1
+; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT:    s_mov_b32 s2, s0
+; GFX9V3-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX9V3-NEXT:    s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9V3-NEXT:    s_mov_b32 s4, s1
+; GFX9V3-NEXT:    s_cmp_lg_u32 s1, -1
+; GFX9V3-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9V3-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
+; GFX9V3-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9V3-NEXT:    flat_store_dword v[0:1], v2
+; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V3-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9V3-NEXT:    v_mov_b32_e32 v2, 2
+; GFX9V3-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9V3-NEXT:    flat_store_dword v[0:1], v2
+; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V3-NEXT:    s_endpgm
+;
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -110,6 +161,19 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 }
 
 define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
+; GFX8V3-LABEL: llvm_amdgcn_is_shared:
+; GFX8V3:       ; %bb.0:
+; GFX8V3-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT:    s_load_dword s0, s[4:5], 0x40
+; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V3-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT:    flat_store_dword v[0:1], v0
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    s_endpgm
+;
 ; GFX8V4-LABEL: llvm_amdgcn_is_shared:
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -136,6 +200,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V5-NEXT:    s_endpgm
 ;
+; GFX9V3-LABEL: llvm_amdgcn_is_shared:
+; GFX9V3:       ; %bb.0:
+; GFX9V3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V3-NEXT:    s_mov_b64 s[2:3], src_shared_base
+; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT:    s_cmp_eq_u32 s1, s3
+; GFX9V3-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9V3-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9V3-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V3-NEXT:    s_endpgm
+;
 ; GFX9V4-LABEL: llvm_amdgcn_is_shared:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -166,6 +242,19 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 }
 
 define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
+; GFX8V3-LABEL: llvm_amdgcn_is_private:
+; GFX8V3:       ; %bb.0:
+; GFX8V3-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT:    s_load_dword s0, s[4:5], 0x44
+; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V3-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT:    flat_store_dword v[0:1], v0
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    s_endpgm
+;
 ; GFX8V4-LABEL: llvm_amdgcn_is_private:
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -192,6 +281,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V5-NEXT:    s_endpgm
 ;
+; GFX9V3-LABEL: llvm_amdgcn_is_private:
+; GFX9V3:       ; %bb.0:
+; GFX9V3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V3-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT:    s_cmp_eq_u32 s1, s3
+; GFX9V3-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9V3-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9V3-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V3-NEXT:    s_endpgm
+;
 ; GFX9V4-LABEL: llvm_amdgcn_is_private:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -222,6 +323,11 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 }
 
 define amdgpu_kernel void @llvm_trap() {
+; GFX8V3-LABEL: llvm_trap:
+; GFX8V3:       ; %bb.0:
+; GFX8V3-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GFX8V3-NEXT:    s_trap 2
+;
 ; GFX8V4-LABEL: llvm_trap:
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_mov_b64 s[0:1], s[4:5]
@@ -233,6 +339,11 @@ define amdgpu_kernel void @llvm_trap() {
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_trap 2
 ;
+; GFX9V3-LABEL: llvm_trap:
+; GFX9V3:       ; %bb.0:
+; GFX9V3-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GFX9V3-NEXT:    s_trap 2
+;
 ; GFX9V4-LABEL: llvm_trap:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_trap 2
@@ -245,6 +356,10 @@ define amdgpu_kernel void @llvm_trap() {
 }
 
 define amdgpu_kernel void @llvm_debugtrap() {
+; GFX8V3-LABEL: llvm_debugtrap:
+; GFX8V3:       ; %bb.0:
+; GFX8V3-NEXT:    s_trap 3
+;
 ; GFX8V4-LABEL: llvm_debugtrap:
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_trap 3
@@ -253,6 +368,10 @@ define amdgpu_kernel void @llvm_debugtrap() {
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_trap 3
 ;
+; GFX9V3-LABEL: llvm_debugtrap:
+; GFX9V3:       ; %bb.0:
+; GFX9V3-NEXT:    s_trap 3
+;
 ; GFX9V4-LABEL: llvm_debugtrap:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_trap 3
@@ -265,6 +384,32 @@ define amdgpu_kernel void @llvm_debugtrap() {
 }
 
 define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
+; GFX8V3-LABEL: llvm_amdgcn_queue_ptr:
+; GFX8V3:       ; %bb.0:
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8V3-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8V3-NEXT:    s_add_u32 s0, s8, 8
+; GFX8V3-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; GFX8V3-NEXT:    s_addc_u32 s1, s9, 0
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8V3-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8V3-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8V3-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; GFX8V3-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8V3-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8V3-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8V3-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    s_endpgm
+;
 ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s6
@@ -315,6 +460,23 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V5-NEXT:    s_endpgm
 ;
+; GFX9V3-LABEL: llvm_amdgcn_queue_ptr:
+; GFX9V3:       ; %bb.0:
+; GFX9V3-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9V3-NEXT:    global_load_ubyte v0, v2, s[6:7] glc
+; GFX9V3-NEXT:    global_load_ubyte v0, v2, s[8:9] offset:8 glc
+; GFX9V3-NEXT:    global_load_ubyte v0, v2, s[4:5] glc
+; GFX9V3-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V3-NEXT:    v_mov_b32_e32 v0, s10
+; GFX9V3-NEXT:    v_mov_b32_e32 v1, s11
+; GFX9V3-NEXT:    ; kill: killed $sgpr6_sgpr7
+; GFX9V3-NEXT:    ; kill: killed $sgpr4_sgpr5
+; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V3-NEXT:    s_endpgm
+;
 ; GFX9V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    v_mov_b32_e32 v2, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll
new file mode 100644
index 000000000000000..20d0aea61f27688
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-v3.ll
@@ -0,0 +1,148 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -amdgpu-verify-hsa-metadata -filetype=obj -o /dev/null < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+; CHECK-LABEL: {{^}}min_64_max_64:
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 0
+; CHECK: NumSGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 1
+define amdgpu_kernel void @min_64_max_64() #0 {
+entry:
+  ret void
+}
+attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
+
+; CHECK-LABEL: {{^}}min_64_max_128:
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 0
+; CHECK: NumSGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 1
+define amdgpu_kernel void @min_64_max_128() #1 {
+entry:
+  ret void
+}
+attributes #1 = {"amdgpu-flat-work-group-size"="64,128"}
+
+; CHECK-LABEL: {{^}}min_128_max_128:
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 0
+; CHECK: NumSGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 1
+define amdgpu_kernel void @min_128_max_128() #2 {
+entry:
+  ret void
+}
+attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
+
+; CHECK-LABEL: {{^}}min_1024_max_1024
+; CHECK: SGPRBlocks: 0
+; CHECK: VGPRBlocks: 10
+; CHECK: NumSGPRsForWavesPerEU: 2{{$}}
+; CHECK: NumVGPRsForWavesPerEU: 43
+ at var = addrspace(1) global float 0.0
+define amdgpu_kernel void @min_1024_max_1024() #3 {
+  %val0 = load volatile float, ptr addrspace(1) @var
+  %val1 = load volatile float, ptr addrspace(1) @var
+  %val2 = load volatile float, ptr addrspace(1) @var
+  %val3 = load volatile float, ptr addrspace(1) @var
+  %val4 = load volatile float, ptr addrspace(1) @var
+  %val5 = load volatile float, ptr addrspace(1) @var
+  %val6 = load volatile float, ptr addrspace(1) @var
+  %val7 = load volatile float, ptr addrspace(1) @var
+  %val8 = load volatile float, ptr addrspace(1) @var
+  %val9 = load volatile float, ptr addrspace(1) @var
+  %val10 = load volatile float, ptr addrspace(1) @var
+  %val11 = load volatile float, ptr addrspace(1) @var
+  %val12 = load volatile float, ptr addrspace(1) @var
+  %val13 = load volatile float, ptr addrspace(1) @var
+  %val14 = load volatile float, ptr addrspace(1) @var
+  %val15 = load volatile float, ptr addrspace(1) @var
+  %val16 = load volatile float, ptr addrspace(1) @var
+  %val17 = load volatile float, ptr addrspace(1) @var
+  %val18 = load volatile float, ptr addrspace(1) @var
+  %val19 = load volatile float, ptr addrspace(1) @var
+  %val20 = load volatile float, ptr addrspace(1) @var
+  %val21 = load volatile float, ptr addrspace(1) @var
+  %val22 = load volatile float, ptr addrspace(1) @var
+  %val23 = load volatile float, ptr addrspace(1) @var
+  %val24 = load volatile float, ptr addrspace(1) @var
+  %val25 = load volatile float, ptr addrspace(1) @var
+  %val26 = load volatile float, ptr addrspace(1) @var
+  %val27 = load volatile float, ptr addrspace(1) @var
+  %val28 = load volatile float, ptr addrspace(1) @var
+  %val29 = load volatile float, ptr addrspace(1) @var
+  %val30 = load volatile float, ptr addrspace(1) @var
+  %val31 = load volatile float, ptr addrspace(1) @var
+  %val32 = load volatile float, ptr addrspace(1) @var
+  %val33 = load volatile float, ptr addrspace(1) @var
+  %val34 = load volatile float, ptr addrspace(1) @var
+  %val35 = load volatile float, ptr addrspace(1) @var
+  %val36 = load volatile float, ptr addrspace(1) @var
+  %val37 = load volatile float, ptr addrspace(1) @var
+  %val38 = load volatile float, ptr addrspace(1) @var
+  %val39 = load volatile float, ptr addrspace(1) @var
+  %val40 = load volatile float, ptr addrspace(1) @var
+
+  store volatile float %val0, ptr addrspace(1) @var
+  store volatile float %val1, ptr addrspace(1) @var
+  store volatile float %val2, ptr addrspace(1) @var
+  store volatile float %val3, ptr addrspace(1) @var
+  store volatile float %val4, ptr addrspace(1) @var
+  store volatile float %val5, ptr addrspace(1) @var
+  store volatile float %val6, ptr addrspace(1) @var
+  store volatile float %val7, ptr addrspace(1) @var
+  store volatile float %val8, ptr addrspace(1) @var
+  store volatile float %val9, ptr addrspace(1) @var
+  store volatile float %val10, ptr addrspace(1) @var
+  store volatile float %val11, ptr addrspace(1) @var
+  store volatile float %val12, ptr addrspace(1) @var
+  store volatile float %val13, ptr addrspace(1) @var
+  store volatile float %val14, ptr addrspace(1) @var
+  store volatile float %val15, ptr addrspace(1) @var
+  store volatile float %val16, ptr addrspace(1) @var
+  store volatile float %val17, ptr addrspace(1) @var
+  store volatile float %val18, ptr addrspace(1) @var
+  store volatile float %val19, ptr addrspace(1) @var
+  store volatile float %val20, ptr addrspace(1) @var
+  store volatile float %val21, ptr addrspace(1) @var
+  store volatile float %val22, ptr addrspace(1) @var
+  store volatile float %val23, ptr addrspace(1) @var
+  store volatile float %val24, ptr addrspace(1) @var
+  store volatile float %val25, ptr addrspace(1) @var
+  store volatile float %val26, ptr addrspace(1) @var
+  store volatile float %val27, ptr addrspace(1) @var
+  store volatile float %val28, ptr addrspace(1) @var
+  store volatile float %val29, ptr addrspace(1) @var
+  store volatile float %val30, ptr addrspace(1) @var
+  store volatile float %val31, ptr addrspace(1) @var
+  store volatile float %val32, ptr addrspace(1) @var
+  store volatile float %val33, ptr addrspace(1) @var
+  store volatile float %val34, ptr addrspace(1) @var
+  store volatile float %val35, ptr addrspace(1) @var
+  store volatile float %val36, ptr addrspace(1) @var
+  store volatile float %val37, ptr addrspace(1) @var
+  store volatile float %val38, ptr addrspace(1) @var
+  store volatile float %val39, ptr addrspace(1) @var
+  store volatile float %val40, ptr addrspace(1) @var
+
+  ret void
+}
+attributes #3 = {"amdgpu-flat-work-group-size"="1024,1024"}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
+
+; CHECK: amdhsa.kernels:
+; CHECK:   .max_flat_workgroup_size: 64
+; CHECK:   .name:                 min_64_max_64
+; CHECK:   .max_flat_workgroup_size: 128
+; CHECK:   .name:                 min_64_max_128
+; CHECK:   .max_flat_workgroup_size: 128
+; CHECK:   .name:                 min_128_max_128
+; CHECK:   .max_flat_workgroup_size: 1024
+; CHECK:   .name:                 min_1024_max_1024
+; CHECK: amdhsa.version:
+; CHECK:   - 1
+; CHECK:   - 0
+
+; PARSER: AMDGPU HSA Metadata Parser Test: PASS

diff  --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target-v3.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target-v3.ll
new file mode 100644
index 000000000000000..6c553e3726abf90
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target-v3.ll
@@ -0,0 +1,168 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=V3-GFX600 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck --check-prefixes=V3-GFX600 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx601 < %s | FileCheck --check-prefixes=V3-GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=pitcairn < %s | FileCheck --check-prefixes=V3-GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=verde < %s | FileCheck --check-prefixes=V3-GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx602 < %s | FileCheck --check-prefixes=V3-GFX602 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hainan < %s | FileCheck --check-prefixes=V3-GFX602 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=oland < %s | FileCheck --check-prefixes=V3-GFX602 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=V3-GFX700 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=V3-GFX700 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx701 < %s | FileCheck --check-prefixes=V3-GFX701 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck --check-prefixes=V3-GFX701 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx702 < %s | FileCheck --check-prefixes=V3-GFX702 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 < %s | FileCheck --check-prefixes=V3-GFX703 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kabini < %s | FileCheck --check-prefixes=V3-GFX703 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=mullins < %s | FileCheck --check-prefixes=V3-GFX703 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx704 < %s | FileCheck --check-prefixes=V3-GFX704 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck --check-prefixes=V3-GFX704 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx705 < %s | FileCheck --check-prefixes=V3-GFX705 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 < %s | FileCheck --check-prefixes=V3-GFX801-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX801-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX801-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo < %s | FileCheck --check-prefixes=V3-GFX801-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX801-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX801-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 < %s | FileCheck --check-prefixes=V3-GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland < %s | FileCheck --check-prefixes=V3-GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck --check-prefixes=V3-GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefixes=V3-GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck --check-prefixes=V3-GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=polaris10 < %s | FileCheck --check-prefixes=V3-GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=polaris11 < %s | FileCheck --check-prefixes=V3-GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx805 < %s | FileCheck --check-prefixes=V3-GFX805 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tongapro < %s | FileCheck --check-prefixes=V3-GFX805 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 < %s | FileCheck --check-prefixes=V3-GFX810-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX810-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX810-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=stoney < %s | FileCheck --check-prefixes=V3-GFX810-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=stoney -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX810-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=stoney -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX810-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=V3-GFX900-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX900-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX900-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 < %s | FileCheck --check-prefixes=V3-GFX902-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX902-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX902-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 < %s | FileCheck --check-prefixes=V3-GFX904-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX904-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX904-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=-sramecc < %s | FileCheck --check-prefixes=V3-GFX906-NOSRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=-sramecc,-xnack < %s | FileCheck --check-prefixes=V3-GFX906-NOSRAMECC-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc,-xnack < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=-sramecc,+xnack < %s | FileCheck --check-prefixes=V3-GFX906-NOSRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+sramecc,+xnack < %s | FileCheck --check-prefixes=V3-GFX906-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=-sramecc < %s | FileCheck --check-prefixes=V3-GFX908-NOSRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+sramecc < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=-sramecc,-xnack < %s | FileCheck --check-prefixes=V3-GFX908-NOSRAMECC-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+sramecc,-xnack < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=-sramecc,+xnack < %s | FileCheck --check-prefixes=V3-GFX908-NOSRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -mattr=+sramecc,+xnack < %s | FileCheck --check-prefixes=V3-GFX908-SRAMECC-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx909 < %s | FileCheck --check-prefixes=V3-GFX909-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx909 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX909-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx909 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX909-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c < %s | FileCheck --check-prefixes=V3-GFX90C-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX90C-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX90C-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefixes=V3-GFX940-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX940-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX940-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=V3-GFX1010-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX1010-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX1010-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1011 < %s | FileCheck --check-prefixes=V3-GFX1011-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1011 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX1011-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1011 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX1011-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1012 < %s | FileCheck --check-prefixes=V3-GFX1012-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1012 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX1012-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1012 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX1012-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1013 < %s | FileCheck --check-prefixes=V3-GFX1013-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1013 -mattr=-xnack < %s | FileCheck --check-prefixes=V3-GFX1013-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1013 -mattr=+xnack < %s | FileCheck --check-prefixes=V3-GFX1013-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck --check-prefixes=V3-GFX1030 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck --check-prefixes=V3-GFX1031 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1032 < %s | FileCheck --check-prefixes=V3-GFX1032 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1033 < %s | FileCheck --check-prefixes=V3-GFX1033 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1034 < %s | FileCheck --check-prefixes=V3-GFX1034 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1035 < %s | FileCheck --check-prefixes=V3-GFX1035 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1036 < %s | FileCheck --check-prefixes=V3-GFX1036 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=V3-GFX1100 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 < %s | FileCheck --check-prefixes=V3-GFX1101 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 < %s | FileCheck --check-prefixes=V3-GFX1102 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1103 < %s | FileCheck --check-prefixes=V3-GFX1103 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1150 < %s | FileCheck --check-prefixes=V3-GFX1150 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1151 < %s | FileCheck --check-prefixes=V3-GFX1151 %s
+
+; V3-GFX600: .amdgcn_target "amdgcn-amd-amdhsa--gfx600"
+; V3-GFX601: .amdgcn_target "amdgcn-amd-amdhsa--gfx601"
+; V3-GFX602: .amdgcn_target "amdgcn-amd-amdhsa--gfx602"
+; V3-GFX700: .amdgcn_target "amdgcn-amd-amdhsa--gfx700"
+; V3-GFX701: .amdgcn_target "amdgcn-amd-amdhsa--gfx701"
+; V3-GFX702: .amdgcn_target "amdgcn-amd-amdhsa--gfx702"
+; V3-GFX703: .amdgcn_target "amdgcn-amd-amdhsa--gfx703"
+; V3-GFX704: .amdgcn_target "amdgcn-amd-amdhsa--gfx704"
+; V3-GFX705: .amdgcn_target "amdgcn-amd-amdhsa--gfx705"
+; V3-GFX801-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx801"
+; V3-GFX801-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx801+xnack"
+; V3-GFX802: .amdgcn_target "amdgcn-amd-amdhsa--gfx802"
+; V3-GFX803: .amdgcn_target "amdgcn-amd-amdhsa--gfx803"
+; V3-GFX805: .amdgcn_target "amdgcn-amd-amdhsa--gfx805"
+; V3-GFX810-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx810"
+; V3-GFX810-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx810+xnack"
+; V3-GFX900-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx900"
+; V3-GFX900-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx900+xnack"
+; V3-GFX902-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx902"
+; V3-GFX902-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx902+xnack"
+; V3-GFX904-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx904"
+; V3-GFX904-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack"
+; V3-GFX906-NOSRAMECC-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx906"
+; V3-GFX906-SRAMECC-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+sram-ecc"
+; V3-GFX906-NOSRAMECC-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack"
+; V3-GFX906-SRAMECC-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack+sram-ecc"
+; V3-GFX908-NOSRAMECC-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx908"
+; V3-GFX908-SRAMECC-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx908+sram-ecc"
+; V3-GFX908-NOSRAMECC-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx908+xnack"
+; V3-GFX908-SRAMECC-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx908+xnack+sram-ecc"
+; V3-GFX909-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx909"
+; V3-GFX909-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx909+xnack"
+; V3-GFX90C-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx90c"
+; V3-GFX90C-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx90c+xnack"
+; V3-GFX940-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx940+sram-ecc"
+; V3-GFX940-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx940+xnack+sram-ecc"
+; V3-GFX1010-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010"
+; V3-GFX1010-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010+xnack"
+; V3-GFX1011-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1011"
+; V3-GFX1011-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1011+xnack"
+; V3-GFX1012-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1012"
+; V3-GFX1012-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1012+xnack"
+; V3-GFX1013-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1013"
+; V3-GFX1013-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1013+xnack"
+; V3-GFX1030: .amdgcn_target "amdgcn-amd-amdhsa--gfx1030"
+; V3-GFX1031: .amdgcn_target "amdgcn-amd-amdhsa--gfx1031"
+; V3-GFX1032: .amdgcn_target "amdgcn-amd-amdhsa--gfx1032"
+; V3-GFX1033: .amdgcn_target "amdgcn-amd-amdhsa--gfx1033"
+; V3-GFX1034: .amdgcn_target "amdgcn-amd-amdhsa--gfx1034"
+; V3-GFX1035: .amdgcn_target "amdgcn-amd-amdhsa--gfx1035"
+; V3-GFX1036: .amdgcn_target "amdgcn-amd-amdhsa--gfx1036"
+; V3-GFX1100: .amdgcn_target "amdgcn-amd-amdhsa--gfx1100"
+; V3-GFX1101: .amdgcn_target "amdgcn-amd-amdhsa--gfx1101"
+; V3-GFX1102: .amdgcn_target "amdgcn-amd-amdhsa--gfx1102"
+; V3-GFX1103: .amdgcn_target "amdgcn-amd-amdhsa--gfx1103"
+; V3-GFX1150: .amdgcn_target "amdgcn-amd-amdhsa--gfx1150"
+; V3-GFX1151: .amdgcn_target "amdgcn-amd-amdhsa--gfx1151"
+
+
+
+define amdgpu_kernel void @directive_amdgcn_target() {
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll
similarity index 98%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll
index 042abe382283a65..37b124e7f59a09f 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll
@@ -142,7 +142,7 @@ define amdgpu_kernel void @test_no_default_queue(i8 %a) #3
 
 ; CHECK:  amdhsa.version:
 ; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
 ; CHECK-NOT:  amdhsa.printf:
 
 attributes #0 = { optnone noinline "amdgpu-no-default-queue" "amdgpu-no-completion-action" "amdgpu-implicitarg-num-bytes"="48" }
@@ -151,7 +151,7 @@ attributes #2 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-implici
 attributes #3 = { optnone noinline "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="48" }
 
 !llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
 
 !1 = !{i32 0}
 !2 = !{!"none"}

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
index fb08fd2c45085a1..8e8023aa16f1303 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll
@@ -39,4 +39,4 @@ define internal void @bar.5() {
 ; PARSER: AMDGPU HSA Metadata Parser Test: PASS
 
 !llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
similarity index 99%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
index dc3a6e8b633b233..69efc47008e6aad 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
@@ -1739,14 +1739,14 @@ define amdgpu_kernel void @unknown_addrspace_kernarg(ptr addrspace(12345) %ptr)
 ; CHECK-NEXT: - '2:1:8:%g\n'
 ; CHECK:  amdhsa.version:
 ; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
 
 attributes #0 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" }
 attributes #1 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
 attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
 
 !llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
 
 !llvm.printf.fmts = !{!100, !101}
 

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v4.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
similarity index 99%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v4.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
index f4892ebdc9c9374..47b882494c9191a 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v4.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll
@@ -286,7 +286,7 @@ entry:
 
 ; CHECK:  amdhsa.version:
 ; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
 
 ; We don't have a use of llvm.amdgcn.implicitarg.ptr, so optnone to
 ; avoid optimizing out the implicit argument allocation.
@@ -298,4 +298,4 @@ attributes #4 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" }
 attributes #5 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
 
 !llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-asan.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3-asan.ll
similarity index 96%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-asan.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3-asan.ll
index 22c6e147762200d..cb3ae289721bc17 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-asan.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3-asan.ll
@@ -35,12 +35,12 @@ define amdgpu_kernel void @test_kernel(i8 %a) #0
 
 ; CHECK:  amdhsa.version:
 ; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
 
 attributes #0 = { sanitize_address "amdgpu-implicitarg-num-bytes"="48" }
 
 !llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
 !1 = !{i32 0}
 !2 = !{!"none"}
 !3 = !{!"char"}

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v3.ll
similarity index 99%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v3.ll
index 8f90025fe8e29c1..a3f8c5cff95df85 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v4.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-v3.ll
@@ -296,4 +296,4 @@ attributes #3 = { "amdgpu-implicitarg-num-bytes"="48" "amdgpu-no-hostcall-ptr" }
 attributes #4 = { noinline }
 
 !llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
similarity index 98%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
index 6d49f22eb429b24..b7f58bbb51bb29d 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-images.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-images-v3.ll
@@ -96,10 +96,10 @@ define amdgpu_kernel void @test(ptr addrspace(1) %a,
 
 ; CHECK:  amdhsa.version:
 ; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
 
 !llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}
 
 !1 = !{!"image1d_t", !"image1d_array_t", !"image1d_buffer_t",
        !"image2d_t", !"image2d_array_t", !"image2d_array_depth_t",

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll
similarity index 80%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll
index fc5e6e273125334..8117037baaffc9d 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-1-v3.ll
@@ -5,9 +5,9 @@
 ; CHECK: ---
 ; CHECK: amdhsa.version:
 ; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
 ; CHECK: ...
 
 !opencl.ocl.version = !{}
 !llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll
similarity index 81%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll
index 1ec79c95bc2a308..ea744863a9b8875 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-invalid-ocl-version-3-v3.ll
@@ -5,10 +5,10 @@
 ; CHECK: ---
 ; CHECK: amdhsa.version:
 ; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
 ; CHECK: ...
 
 !opencl.ocl.version = !{!0}
 !llvm.module.flags = !{!1}
 !0 = !{i32 1}
-!1 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!1 = !{i32 1, !"amdgpu_code_object_version", i32 300}

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
similarity index 99%
rename from llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
rename to llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
index e45c4d1786faf67..d6f7a92af9dcb6f 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll
@@ -158,11 +158,11 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 {
 
 ; CHECK:  amdhsa.version:
 ; CHECK-NEXT: - 1
-; CHECK-NEXT: - 1
+; CHECK-NEXT: - 0
 
 attributes #0 = { "amdgpu-num-sgpr"="14" }
 attributes #1 = { "amdgpu-num-vgpr"="20" }
 attributes #2 = { "amdgpu-flat-work-group-size"="1,256" }
 
 !llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}

diff  --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 9e6c0ef86906ddb..9760e93eb48e6bd 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -1,11 +1,36 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V3 %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V4 %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 | FileCheck --check-prefix=GFX8V5 %s
 
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V3 %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V4 %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s
 
 define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
+; GFX8V3-LABEL: addrspacecast:
+; GFX8V3:       ; %bb.0:
+; GFX8V3-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V3-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x40
+; GFX8V3-NEXT:    v_mov_b32_e32 v4, 1
+; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX8V3-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX8V3-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX8V3-NEXT:    s_cmp_lg_u32 s1, -1
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8V3-NEXT:    s_cselect_b32 s0, s2, 0
+; GFX8V3-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX8V3-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8V3-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8V3-NEXT:    flat_store_dword v[0:1], v4
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, 2
+; GFX8V3-NEXT:    flat_store_dword v[2:3], v0
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    s_endpgm
+;
 ; GFX8V4-LABEL: addrspacecast:
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -52,6 +77,30 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V5-NEXT:    s_endpgm
 ;
+; GFX9V3-LABEL: addrspacecast:
+; GFX9V3:       ; %bb.0:
+; GFX9V3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V3-NEXT:    s_mov_b64 s[2:3], src_private_base
+; GFX9V3-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX9V3-NEXT:    v_mov_b32_e32 v4, 1
+; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT:    s_cmp_lg_u32 s0, -1
+; GFX9V3-NEXT:    s_cselect_b32 s2, s3, 0
+; GFX9V3-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX9V3-NEXT:    s_cmp_lg_u32 s1, -1
+; GFX9V3-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9V3-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9V3-NEXT:    s_cselect_b32 s0, s5, 0
+; GFX9V3-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX9V3-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9V3-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9V3-NEXT:    flat_store_dword v[0:1], v4
+; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V3-NEXT:    v_mov_b32_e32 v0, 2
+; GFX9V3-NEXT:    flat_store_dword v[2:3], v0
+; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V3-NEXT:    s_endpgm
+;
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -107,6 +156,18 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 }
 
 define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
+; GFX8V3-LABEL: llvm_amdgcn_is_shared:
+; GFX8V3:       ; %bb.0:
+; GFX8V3-NEXT:    s_load_dword s0, s[4:5], 0x40
+; GFX8V3-NEXT:    s_load_dword s1, s[6:7], 0x4
+; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V3-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX8V3-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8V3-NEXT:    flat_store_dword v[0:1], v0
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    s_endpgm
+;
 ; GFX8V4-LABEL: llvm_amdgcn_is_shared:
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dword s0, s[4:5], 0x40
@@ -131,6 +192,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V5-NEXT:    s_endpgm
 ;
+; GFX9V3-LABEL: llvm_amdgcn_is_shared:
+; GFX9V3:       ; %bb.0:
+; GFX9V3-NEXT:    s_load_dword s2, s[4:5], 0x4
+; GFX9V3-NEXT:    s_mov_b64 s[0:1], src_shared_base
+; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT:    s_cmp_eq_u32 s2, s1
+; GFX9V3-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9V3-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9V3-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V3-NEXT:    s_endpgm
+;
 ; GFX9V4-LABEL: llvm_amdgcn_is_shared:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_load_dword s2, s[4:5], 0x4
@@ -161,6 +234,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 }
 
 define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
+; GFX8V3-LABEL: llvm_amdgcn_is_private:
+; GFX8V3:       ; %bb.0:
+; GFX8V3-NEXT:    s_load_dword s0, s[4:5], 0x44
+; GFX8V3-NEXT:    s_load_dword s1, s[6:7], 0x4
+; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8V3-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V3-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX8V3-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8V3-NEXT:    flat_store_dword v[0:1], v0
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    s_endpgm
+;
 ; GFX8V4-LABEL: llvm_amdgcn_is_private:
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dword s0, s[4:5], 0x44
@@ -185,6 +270,18 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V5-NEXT:    s_endpgm
 ;
+; GFX9V3-LABEL: llvm_amdgcn_is_private:
+; GFX9V3:       ; %bb.0:
+; GFX9V3-NEXT:    s_load_dword s2, s[4:5], 0x4
+; GFX9V3-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT:    s_cmp_eq_u32 s2, s1
+; GFX9V3-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9V3-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9V3-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V3-NEXT:    s_endpgm
+;
 ; GFX9V4-LABEL: llvm_amdgcn_is_private:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_load_dword s2, s[4:5], 0x4
@@ -215,6 +312,11 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 }
 
 define amdgpu_kernel void @llvm_trap() {
+; GFX8V3-LABEL: llvm_trap:
+; GFX8V3:       ; %bb.0:
+; GFX8V3-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GFX8V3-NEXT:    s_trap 2
+;
 ; GFX8V4-LABEL: llvm_trap:
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_mov_b64 s[0:1], s[4:5]
@@ -226,6 +328,11 @@ define amdgpu_kernel void @llvm_trap() {
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_trap 2
 ;
+; GFX9V3-LABEL: llvm_trap:
+; GFX9V3:       ; %bb.0:
+; GFX9V3-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GFX9V3-NEXT:    s_trap 2
+;
 ; GFX9V4-LABEL: llvm_trap:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_trap 2
@@ -238,6 +345,10 @@ define amdgpu_kernel void @llvm_trap() {
 }
 
 define amdgpu_kernel void @llvm_debugtrap() {
+; GFX8V3-LABEL: llvm_debugtrap:
+; GFX8V3:       ; %bb.0:
+; GFX8V3-NEXT:    s_trap 3
+;
 ; GFX8V4-LABEL: llvm_debugtrap:
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_trap 3
@@ -246,6 +357,10 @@ define amdgpu_kernel void @llvm_debugtrap() {
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_trap 3
 ;
+; GFX9V3-LABEL: llvm_debugtrap:
+; GFX9V3:       ; %bb.0:
+; GFX9V3-NEXT:    s_trap 3
+;
 ; GFX9V4-LABEL: llvm_debugtrap:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_trap 3
@@ -258,6 +373,31 @@ define amdgpu_kernel void @llvm_debugtrap() {
 }
 
 define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
+; GFX8V3-LABEL: llvm_amdgcn_queue_ptr:
+; GFX8V3:       ; %bb.0:
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8V3-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8V3-NEXT:    s_add_u32 s0, s8, 8
+; GFX8V3-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; GFX8V3-NEXT:    s_addc_u32 s1, s9, 0
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8V3-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8V3-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8V3-NEXT:    flat_load_ubyte v0, v[0:1] glc
+; GFX8V3-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8V3-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8V3-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8V3-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX8V3-NEXT:    s_endpgm
+;
 ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s6
@@ -306,6 +446,23 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V5-NEXT:    s_endpgm
 ;
+; GFX9V3-LABEL: llvm_amdgcn_queue_ptr:
+; GFX9V3:       ; %bb.0:
+; GFX9V3-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9V3-NEXT:    global_load_ubyte v0, v2, s[6:7] glc
+; GFX9V3-NEXT:    global_load_ubyte v0, v2, s[8:9] offset:8 glc
+; GFX9V3-NEXT:    global_load_ubyte v0, v2, s[4:5] glc
+; GFX9V3-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V3-NEXT:    v_mov_b32_e32 v0, s10
+; GFX9V3-NEXT:    v_mov_b32_e32 v1, s11
+; GFX9V3-NEXT:    ; kill: killed $sgpr6_sgpr7
+; GFX9V3-NEXT:    ; kill: killed $sgpr4_sgpr5
+; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9V3-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9V3-NEXT:    s_endpgm
+;
 ; GFX9V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    v_mov_b32_e32 v2, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
index a04fe28dbffffab..0353e7ee49ab953 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
@@ -1,8 +1,17 @@
 ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefix=DOORBELL %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefix=DOORBELL %s
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA %s
 
 declare void @llvm.trap() #0
 
+; HSA:      .amdhsa_kernel trap
+; HSA-NEXT:     .amdhsa_group_segment_fixed_size 0
+; HSA-NEXT:     .amdhsa_private_segment_fixed_size 0
+; HSA-NEXT:     .amdhsa_kernarg_size 8
+; HSA-NEXT:     .amdhsa_user_sgpr_count 8
+; HSA-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
+; HSA:      .end_amdhsa_kernel
+
 ; DOORBELL:      .amdhsa_kernel trap
 ; DOORBELL-NEXT:     .amdhsa_group_segment_fixed_size 0
 ; DOORBELL-NEXT:     .amdhsa_private_segment_fixed_size 0

diff  --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
index 9ed896c148e6485..792ec2675247f61 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -317,4 +317,4 @@ attributes #1 = { nounwind "stackrealign" }
 attributes #2 = { nounwind alignstack=128 }
 
 !llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 300}

diff  --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 54a15513cf0a506..03ea582698486ce 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -1,54 +1,101 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc %s -o - -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=NOHSA-TRAP-GFX900 %s
-; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX803 %s
-; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900 %s
-; RUN: llc %s -o - -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=NOHSA-TRAP-GFX900-V3 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=NOHSA-TRAP-GFX900-V4 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX803-V3 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX803-V4 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900-V3 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck --check-prefix=HSA-TRAP-GFX900-V4 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/300/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900-V3 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-trap-handler -verify-machineinstrs | FileCheck --check-prefix=HSA-NOTRAP-GFX900-V4 %s
 
 declare void @llvm.trap() #0
 declare void @llvm.debugtrap() #1
 
 define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
-; NOHSA-TRAP-GFX900-LABEL: trap:
-; NOHSA-TRAP-GFX900:       ; %bb.0:
-; NOHSA-TRAP-GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; NOHSA-TRAP-GFX900-NEXT:    v_mov_b32_e32 v0, 0
-; NOHSA-TRAP-GFX900-NEXT:    v_mov_b32_e32 v1, 1
-; NOHSA-TRAP-GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
-; NOHSA-TRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT:    s_endpgm
-;
-; HSA-TRAP-GFX803-LABEL: trap:
-; HSA-TRAP-GFX803:       ; %bb.0:
-; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v2, 1
-; HSA-TRAP-GFX803-NEXT:    s_mov_b64 s[0:1], s[4:5]
-; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s2
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s3
-; HSA-TRAP-GFX803-NEXT:    flat_store_dword v[0:1], v2
-; HSA-TRAP-GFX803-NEXT:    s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX803-NEXT:    s_trap 2
-;
-; HSA-TRAP-GFX900-LABEL: trap:
-; HSA-TRAP-GFX900:       ; %bb.0:
-; HSA-TRAP-GFX900-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; HSA-TRAP-GFX900-NEXT:    v_mov_b32_e32 v0, 0
-; HSA-TRAP-GFX900-NEXT:    v_mov_b32_e32 v1, 1
-; HSA-TRAP-GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-TRAP-GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
-; HSA-TRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX900-NEXT:    s_trap 2
-;
-; HSA-NOTRAP-GFX900-LABEL: trap:
-; HSA-NOTRAP-GFX900:       ; %bb.0:
-; HSA-NOTRAP-GFX900-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; HSA-NOTRAP-GFX900-NEXT:    v_mov_b32_e32 v0, 0
-; HSA-NOTRAP-GFX900-NEXT:    v_mov_b32_e32 v1, 1
-; HSA-NOTRAP-GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
-; HSA-NOTRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT:    s_endpgm
+; NOHSA-TRAP-GFX900-V3-LABEL: trap:
+; NOHSA-TRAP-GFX900-V3:       ; %bb.0:
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; NOHSA-TRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v0, 0
+; NOHSA-TRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v1, 1
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_waitcnt lgkmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT:    global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_endpgm
+;
+; NOHSA-TRAP-GFX900-V4-LABEL: trap:
+; NOHSA-TRAP-GFX900-V4:       ; %bb.0:
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v0, 0
+; NOHSA-TRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v1, 1
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_waitcnt lgkmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT:    global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_endpgm
+;
+; HSA-TRAP-GFX803-V3-LABEL: trap:
+; HSA-TRAP-GFX803-V3:       ; %bb.0:
+; HSA-TRAP-GFX803-V3-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
+; HSA-TRAP-GFX803-V3-NEXT:    v_mov_b32_e32 v2, 1
+; HSA-TRAP-GFX803-V3-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-GFX803-V3-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT:    v_mov_b32_e32 v0, s2
+; HSA-TRAP-GFX803-V3-NEXT:    v_mov_b32_e32 v1, s3
+; HSA-TRAP-GFX803-V3-NEXT:    flat_store_dword v[0:1], v2
+; HSA-TRAP-GFX803-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT:    s_trap 2
+;
+; HSA-TRAP-GFX803-V4-LABEL: trap:
+; HSA-TRAP-GFX803-V4:       ; %bb.0:
+; HSA-TRAP-GFX803-V4-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
+; HSA-TRAP-GFX803-V4-NEXT:    v_mov_b32_e32 v2, 1
+; HSA-TRAP-GFX803-V4-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-GFX803-V4-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT:    v_mov_b32_e32 v0, s2
+; HSA-TRAP-GFX803-V4-NEXT:    v_mov_b32_e32 v1, s3
+; HSA-TRAP-GFX803-V4-NEXT:    flat_store_dword v[0:1], v2
+; HSA-TRAP-GFX803-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT:    s_trap 2
+;
+; HSA-TRAP-GFX900-V3-LABEL: trap:
+; HSA-TRAP-GFX900-V3:       ; %bb.0:
+; HSA-TRAP-GFX900-V3-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
+; HSA-TRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v1, 1
+; HSA-TRAP-GFX900-V3-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-GFX900-V3-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT:    global_store_dword v0, v1, s[2:3]
+; HSA-TRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT:    s_trap 2
+;
+; HSA-TRAP-GFX900-V4-LABEL: trap:
+; HSA-TRAP-GFX900-V4:       ; %bb.0:
+; HSA-TRAP-GFX900-V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v1, 1
+; HSA-TRAP-GFX900-V4-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT:    global_store_dword v0, v1, s[0:1]
+; HSA-TRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT:    s_trap 2
+;
+; HSA-NOTRAP-GFX900-V3-LABEL: trap:
+; HSA-NOTRAP-GFX900-V3:       ; %bb.0:
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-NOTRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-NOTRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v1, 1
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT:    global_store_dword v0, v1, s[0:1]
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_endpgm
+;
+; HSA-NOTRAP-GFX900-V4-LABEL: trap:
+; HSA-NOTRAP-GFX900-V4:       ; %bb.0:
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-NOTRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-NOTRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v1, 1
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT:    global_store_dword v0, v1, s[0:1]
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_endpgm
   store volatile i32 1, ptr addrspace(1) %arg0
   call void @llvm.trap()
   unreachable
@@ -57,77 +104,150 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 }
 
 define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
-; NOHSA-TRAP-GFX900-LABEL: non_entry_trap:
-; NOHSA-TRAP-GFX900:       ; %bb.0: ; %entry
-; NOHSA-TRAP-GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; NOHSA-TRAP-GFX900-NEXT:    v_mov_b32_e32 v0, 0
-; NOHSA-TRAP-GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT:    global_load_dword v1, v0, s[0:1] glc
-; NOHSA-TRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, -1, v1
-; NOHSA-TRAP-GFX900-NEXT:    s_cbranch_vccz .LBB1_2
-; NOHSA-TRAP-GFX900-NEXT:  ; %bb.1: ; %ret
-; NOHSA-TRAP-GFX900-NEXT:    v_mov_b32_e32 v1, 3
-; NOHSA-TRAP-GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
-; NOHSA-TRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT:    s_endpgm
-; NOHSA-TRAP-GFX900-NEXT:  .LBB1_2: ; %trap
-; NOHSA-TRAP-GFX900-NEXT:    s_endpgm
-;
-; HSA-TRAP-GFX803-LABEL: non_entry_trap:
-; HSA-TRAP-GFX803:       ; %bb.0: ; %entry
-; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s0
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s1
-; HSA-TRAP-GFX803-NEXT:    flat_load_dword v0, v[0:1] glc
-; HSA-TRAP-GFX803-NEXT:    s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX803-NEXT:    v_cmp_eq_u32_e32 vcc, -1, v0
-; HSA-TRAP-GFX803-NEXT:    s_cbranch_vccz .LBB1_2
-; HSA-TRAP-GFX803-NEXT:  ; %bb.1: ; %ret
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s0
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v2, 3
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s1
-; HSA-TRAP-GFX803-NEXT:    flat_store_dword v[0:1], v2
-; HSA-TRAP-GFX803-NEXT:    s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX803-NEXT:    s_endpgm
-; HSA-TRAP-GFX803-NEXT:  .LBB1_2: ; %trap
-; HSA-TRAP-GFX803-NEXT:    s_mov_b64 s[0:1], s[4:5]
-; HSA-TRAP-GFX803-NEXT:    s_trap 2
-;
-; HSA-TRAP-GFX900-LABEL: non_entry_trap:
-; HSA-TRAP-GFX900:       ; %bb.0: ; %entry
-; HSA-TRAP-GFX900-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; HSA-TRAP-GFX900-NEXT:    v_mov_b32_e32 v0, 0
-; HSA-TRAP-GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-TRAP-GFX900-NEXT:    global_load_dword v1, v0, s[0:1] glc
-; HSA-TRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, -1, v1
-; HSA-TRAP-GFX900-NEXT:    s_cbranch_vccz .LBB1_2
-; HSA-TRAP-GFX900-NEXT:  ; %bb.1: ; %ret
-; HSA-TRAP-GFX900-NEXT:    v_mov_b32_e32 v1, 3
-; HSA-TRAP-GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
-; HSA-TRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX900-NEXT:    s_endpgm
-; HSA-TRAP-GFX900-NEXT:  .LBB1_2: ; %trap
-; HSA-TRAP-GFX900-NEXT:    s_trap 2
-;
-; HSA-NOTRAP-GFX900-LABEL: non_entry_trap:
-; HSA-NOTRAP-GFX900:       ; %bb.0: ; %entry
-; HSA-NOTRAP-GFX900-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; HSA-NOTRAP-GFX900-NEXT:    v_mov_b32_e32 v0, 0
-; HSA-NOTRAP-GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT:    global_load_dword v1, v0, s[0:1] glc
-; HSA-NOTRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, -1, v1
-; HSA-NOTRAP-GFX900-NEXT:    s_cbranch_vccz .LBB1_2
-; HSA-NOTRAP-GFX900-NEXT:  ; %bb.1: ; %ret
-; HSA-NOTRAP-GFX900-NEXT:    v_mov_b32_e32 v1, 3
-; HSA-NOTRAP-GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
-; HSA-NOTRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT:    s_endpgm
-; HSA-NOTRAP-GFX900-NEXT:  .LBB1_2: ; %trap
-; HSA-NOTRAP-GFX900-NEXT:    s_endpgm
+; NOHSA-TRAP-GFX900-V3-LABEL: non_entry_trap:
+; NOHSA-TRAP-GFX900-V3:       ; %bb.0: ; %entry
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; NOHSA-TRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v0, 0
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_waitcnt lgkmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT:    v_cmp_eq_u32_e32 vcc, -1, v1
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_cbranch_vccz .LBB1_2
+; NOHSA-TRAP-GFX900-V3-NEXT:  ; %bb.1: ; %ret
+; NOHSA-TRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v1, 3
+; NOHSA-TRAP-GFX900-V3-NEXT:    global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_endpgm
+; NOHSA-TRAP-GFX900-V3-NEXT:  .LBB1_2: ; %trap
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_endpgm
+;
+; NOHSA-TRAP-GFX900-V4-LABEL: non_entry_trap:
+; NOHSA-TRAP-GFX900-V4:       ; %bb.0: ; %entry
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v0, 0
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_waitcnt lgkmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT:    v_cmp_eq_u32_e32 vcc, -1, v1
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_cbranch_vccz .LBB1_2
+; NOHSA-TRAP-GFX900-V4-NEXT:  ; %bb.1: ; %ret
+; NOHSA-TRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v1, 3
+; NOHSA-TRAP-GFX900-V4-NEXT:    global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_endpgm
+; NOHSA-TRAP-GFX900-V4-NEXT:  .LBB1_2: ; %trap
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_endpgm
+;
+; HSA-TRAP-GFX803-V3-LABEL: non_entry_trap:
+; HSA-TRAP-GFX803-V3:       ; %bb.0: ; %entry
+; HSA-TRAP-GFX803-V3-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-TRAP-GFX803-V3-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT:    v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-V3-NEXT:    v_mov_b32_e32 v1, s1
+; HSA-TRAP-GFX803-V3-NEXT:    flat_load_dword v0, v[0:1] glc
+; HSA-TRAP-GFX803-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT:    v_cmp_eq_u32_e32 vcc, -1, v0
+; HSA-TRAP-GFX803-V3-NEXT:    s_cbranch_vccz .LBB1_2
+; HSA-TRAP-GFX803-V3-NEXT:  ; %bb.1: ; %ret
+; HSA-TRAP-GFX803-V3-NEXT:    v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-V3-NEXT:    v_mov_b32_e32 v2, 3
+; HSA-TRAP-GFX803-V3-NEXT:    v_mov_b32_e32 v1, s1
+; HSA-TRAP-GFX803-V3-NEXT:    flat_store_dword v[0:1], v2
+; HSA-TRAP-GFX803-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT:    s_endpgm
+; HSA-TRAP-GFX803-V3-NEXT:  .LBB1_2: ; %trap
+; HSA-TRAP-GFX803-V3-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-GFX803-V3-NEXT:    s_trap 2
+;
+; HSA-TRAP-GFX803-V4-LABEL: non_entry_trap:
+; HSA-TRAP-GFX803-V4:       ; %bb.0: ; %entry
+; HSA-TRAP-GFX803-V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-TRAP-GFX803-V4-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT:    v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-V4-NEXT:    v_mov_b32_e32 v1, s1
+; HSA-TRAP-GFX803-V4-NEXT:    flat_load_dword v0, v[0:1] glc
+; HSA-TRAP-GFX803-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT:    v_cmp_eq_u32_e32 vcc, -1, v0
+; HSA-TRAP-GFX803-V4-NEXT:    s_cbranch_vccz .LBB1_2
+; HSA-TRAP-GFX803-V4-NEXT:  ; %bb.1: ; %ret
+; HSA-TRAP-GFX803-V4-NEXT:    v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-V4-NEXT:    v_mov_b32_e32 v2, 3
+; HSA-TRAP-GFX803-V4-NEXT:    v_mov_b32_e32 v1, s1
+; HSA-TRAP-GFX803-V4-NEXT:    flat_store_dword v[0:1], v2
+; HSA-TRAP-GFX803-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT:    s_endpgm
+; HSA-TRAP-GFX803-V4-NEXT:  .LBB1_2: ; %trap
+; HSA-TRAP-GFX803-V4-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-GFX803-V4-NEXT:    s_trap 2
+;
+; HSA-TRAP-GFX900-V3-LABEL: non_entry_trap:
+; HSA-TRAP-GFX900-V3:       ; %bb.0: ; %entry
+; HSA-TRAP-GFX900-V3-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-TRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX900-V3-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; HSA-TRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT:    v_cmp_eq_u32_e32 vcc, -1, v1
+; HSA-TRAP-GFX900-V3-NEXT:    s_cbranch_vccz .LBB1_2
+; HSA-TRAP-GFX900-V3-NEXT:  ; %bb.1: ; %ret
+; HSA-TRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v1, 3
+; HSA-TRAP-GFX900-V3-NEXT:    global_store_dword v0, v1, s[0:1]
+; HSA-TRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT:    s_endpgm
+; HSA-TRAP-GFX900-V3-NEXT:  .LBB1_2: ; %trap
+; HSA-TRAP-GFX900-V3-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-GFX900-V3-NEXT:    s_trap 2
+;
+; HSA-TRAP-GFX900-V4-LABEL: non_entry_trap:
+; HSA-TRAP-GFX900-V4:       ; %bb.0: ; %entry
+; HSA-TRAP-GFX900-V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX900-V4-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; HSA-TRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT:    v_cmp_eq_u32_e32 vcc, -1, v1
+; HSA-TRAP-GFX900-V4-NEXT:    s_cbranch_vccz .LBB1_2
+; HSA-TRAP-GFX900-V4-NEXT:  ; %bb.1: ; %ret
+; HSA-TRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v1, 3
+; HSA-TRAP-GFX900-V4-NEXT:    global_store_dword v0, v1, s[0:1]
+; HSA-TRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT:    s_endpgm
+; HSA-TRAP-GFX900-V4-NEXT:  .LBB1_2: ; %trap
+; HSA-TRAP-GFX900-V4-NEXT:    s_trap 2
+;
+; HSA-NOTRAP-GFX900-V3-LABEL: non_entry_trap:
+; HSA-NOTRAP-GFX900-V3:       ; %bb.0: ; %entry
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; HSA-NOTRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT:    v_cmp_eq_u32_e32 vcc, -1, v1
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_cbranch_vccz .LBB1_2
+; HSA-NOTRAP-GFX900-V3-NEXT:  ; %bb.1: ; %ret
+; HSA-NOTRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v1, 3
+; HSA-NOTRAP-GFX900-V3-NEXT:    global_store_dword v0, v1, s[0:1]
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_endpgm
+; HSA-NOTRAP-GFX900-V3-NEXT:  .LBB1_2: ; %trap
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_endpgm
+;
+; HSA-NOTRAP-GFX900-V4-LABEL: non_entry_trap:
+; HSA-NOTRAP-GFX900-V4:       ; %bb.0: ; %entry
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-NOTRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT:    global_load_dword v1, v0, s[0:1] glc
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT:    v_cmp_eq_u32_e32 vcc, -1, v1
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_cbranch_vccz .LBB1_2
+; HSA-NOTRAP-GFX900-V4-NEXT:  ; %bb.1: ; %ret
+; HSA-NOTRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v1, 3
+; HSA-NOTRAP-GFX900-V4-NEXT:    global_store_dword v0, v1, s[0:1]
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_endpgm
+; HSA-NOTRAP-GFX900-V4-NEXT:  .LBB1_2: ; %trap
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_endpgm
 entry:
   %tmp29 = load volatile i32, ptr addrspace(1) %arg0
   %cmp = icmp eq i32 %tmp29, -1
@@ -143,60 +263,115 @@ ret:
 }
 
 define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) {
-; NOHSA-TRAP-GFX900-LABEL: debugtrap:
-; NOHSA-TRAP-GFX900:       ; %bb.0:
-; NOHSA-TRAP-GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; NOHSA-TRAP-GFX900-NEXT:    v_mov_b32_e32 v0, 0
-; NOHSA-TRAP-GFX900-NEXT:    v_mov_b32_e32 v1, 1
-; NOHSA-TRAP-GFX900-NEXT:    v_mov_b32_e32 v2, 2
-; NOHSA-TRAP-GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
-; NOHSA-TRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT:    global_store_dword v0, v2, s[0:1]
-; NOHSA-TRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT:    s_endpgm
-;
-; HSA-TRAP-GFX803-LABEL: debugtrap:
-; HSA-TRAP-GFX803:       ; %bb.0:
-; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v2, 1
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v3, 2
-; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s0
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s1
-; HSA-TRAP-GFX803-NEXT:    flat_store_dword v[0:1], v2
-; HSA-TRAP-GFX803-NEXT:    s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX803-NEXT:    s_trap 3
-; HSA-TRAP-GFX803-NEXT:    flat_store_dword v[0:1], v3
-; HSA-TRAP-GFX803-NEXT:    s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX803-NEXT:    s_endpgm
-;
-; HSA-TRAP-GFX900-LABEL: debugtrap:
-; HSA-TRAP-GFX900:       ; %bb.0:
-; HSA-TRAP-GFX900-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; HSA-TRAP-GFX900-NEXT:    v_mov_b32_e32 v0, 0
-; HSA-TRAP-GFX900-NEXT:    v_mov_b32_e32 v1, 1
-; HSA-TRAP-GFX900-NEXT:    v_mov_b32_e32 v2, 2
-; HSA-TRAP-GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-TRAP-GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
-; HSA-TRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX900-NEXT:    s_trap 3
-; HSA-TRAP-GFX900-NEXT:    global_store_dword v0, v2, s[0:1]
-; HSA-TRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; HSA-TRAP-GFX900-NEXT:    s_endpgm
-;
-; HSA-NOTRAP-GFX900-LABEL: debugtrap:
-; HSA-NOTRAP-GFX900:       ; %bb.0:
-; HSA-NOTRAP-GFX900-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; HSA-NOTRAP-GFX900-NEXT:    v_mov_b32_e32 v0, 0
-; HSA-NOTRAP-GFX900-NEXT:    v_mov_b32_e32 v1, 1
-; HSA-NOTRAP-GFX900-NEXT:    v_mov_b32_e32 v2, 2
-; HSA-NOTRAP-GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT:    global_store_dword v0, v1, s[0:1]
-; HSA-NOTRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT:    global_store_dword v0, v2, s[0:1]
-; HSA-NOTRAP-GFX900-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NOTRAP-GFX900-NEXT:    s_endpgm
+; NOHSA-TRAP-GFX900-V3-LABEL: debugtrap:
+; NOHSA-TRAP-GFX900-V3:       ; %bb.0:
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v0, 0
+; NOHSA-TRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v1, 1
+; NOHSA-TRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v2, 2
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_waitcnt lgkmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT:    global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT:    global_store_dword v0, v2, s[0:1]
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V3-NEXT:    s_endpgm
+;
+; NOHSA-TRAP-GFX900-V4-LABEL: debugtrap:
+; NOHSA-TRAP-GFX900-V4:       ; %bb.0:
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v0, 0
+; NOHSA-TRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v1, 1
+; NOHSA-TRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v2, 2
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_waitcnt lgkmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT:    global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT:    global_store_dword v0, v2, s[0:1]
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-V4-NEXT:    s_endpgm
+;
+; HSA-TRAP-GFX803-V3-LABEL: debugtrap:
+; HSA-TRAP-GFX803-V3:       ; %bb.0:
+; HSA-TRAP-GFX803-V3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX803-V3-NEXT:    v_mov_b32_e32 v2, 1
+; HSA-TRAP-GFX803-V3-NEXT:    v_mov_b32_e32 v3, 2
+; HSA-TRAP-GFX803-V3-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT:    v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-V3-NEXT:    v_mov_b32_e32 v1, s1
+; HSA-TRAP-GFX803-V3-NEXT:    flat_store_dword v[0:1], v2
+; HSA-TRAP-GFX803-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT:    s_trap 3
+; HSA-TRAP-GFX803-V3-NEXT:    flat_store_dword v[0:1], v3
+; HSA-TRAP-GFX803-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V3-NEXT:    s_endpgm
+;
+; HSA-TRAP-GFX803-V4-LABEL: debugtrap:
+; HSA-TRAP-GFX803-V4:       ; %bb.0:
+; HSA-TRAP-GFX803-V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX803-V4-NEXT:    v_mov_b32_e32 v2, 1
+; HSA-TRAP-GFX803-V4-NEXT:    v_mov_b32_e32 v3, 2
+; HSA-TRAP-GFX803-V4-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT:    v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-V4-NEXT:    v_mov_b32_e32 v1, s1
+; HSA-TRAP-GFX803-V4-NEXT:    flat_store_dword v[0:1], v2
+; HSA-TRAP-GFX803-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT:    s_trap 3
+; HSA-TRAP-GFX803-V4-NEXT:    flat_store_dword v[0:1], v3
+; HSA-TRAP-GFX803-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-V4-NEXT:    s_endpgm
+;
+; HSA-TRAP-GFX900-V3-LABEL: debugtrap:
+; HSA-TRAP-GFX900-V3:       ; %bb.0:
+; HSA-TRAP-GFX900-V3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v1, 1
+; HSA-TRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v2, 2
+; HSA-TRAP-GFX900-V3-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT:    global_store_dword v0, v1, s[0:1]
+; HSA-TRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT:    s_trap 3
+; HSA-TRAP-GFX900-V3-NEXT:    global_store_dword v0, v2, s[0:1]
+; HSA-TRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V3-NEXT:    s_endpgm
+;
+; HSA-TRAP-GFX900-V4-LABEL: debugtrap:
+; HSA-TRAP-GFX900-V4:       ; %bb.0:
+; HSA-TRAP-GFX900-V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v1, 1
+; HSA-TRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v2, 2
+; HSA-TRAP-GFX900-V4-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT:    global_store_dword v0, v1, s[0:1]
+; HSA-TRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT:    s_trap 3
+; HSA-TRAP-GFX900-V4-NEXT:    global_store_dword v0, v2, s[0:1]
+; HSA-TRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-V4-NEXT:    s_endpgm
+;
+; HSA-NOTRAP-GFX900-V3-LABEL: debugtrap:
+; HSA-NOTRAP-GFX900-V3:       ; %bb.0:
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-NOTRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-NOTRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v1, 1
+; HSA-NOTRAP-GFX900-V3-NEXT:    v_mov_b32_e32 v2, 2
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT:    global_store_dword v0, v1, s[0:1]
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT:    global_store_dword v0, v2, s[0:1]
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V3-NEXT:    s_endpgm
+;
+; HSA-NOTRAP-GFX900-V4-LABEL: debugtrap:
+; HSA-NOTRAP-GFX900-V4:       ; %bb.0:
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; HSA-NOTRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-NOTRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v1, 1
+; HSA-NOTRAP-GFX900-V4-NEXT:    v_mov_b32_e32 v2, 2
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT:    global_store_dword v0, v1, s[0:1]
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT:    global_store_dword v0, v2, s[0:1]
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-V4-NEXT:    s_endpgm
   store volatile i32 1, ptr addrspace(1) %arg0
   call void @llvm.debugtrap()
   store volatile i32 2, ptr addrspace(1) %arg0
@@ -207,4 +382,4 @@ attributes #0 = { nounwind noreturn }
 attributes #1 = { nounwind }
 
 !llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION}

diff  --git a/llvm/test/MC/AMDGPU/hsa-diag-v4.s b/llvm/test/MC/AMDGPU/hsa-diag-v3.s
similarity index 94%
rename from llvm/test/MC/AMDGPU/hsa-diag-v4.s
rename to llvm/test/MC/AMDGPU/hsa-diag-v3.s
index f7a554aedb746b0..369ac905ad2b27b 100644
--- a/llvm/test/MC/AMDGPU/hsa-diag-v4.s
+++ b/llvm/test/MC/AMDGPU/hsa-diag-v3.s
@@ -1,18 +1,18 @@
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX8,PREGFX10,AMDHSA
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX10,AMDHSA
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx1100 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX11,AMDHSA
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd- -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,NONAMDHSA
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GFX90A,PREGFX10,AMDHSA,ALL
+// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX8,PREGFX10,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX10,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx1100 -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10PLUS,GFX11,AMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd- -mcpu=gfx810 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,NONAMDHSA
+// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GFX90A,PREGFX10,AMDHSA,ALL
 
 .text
 
 // GCN-LABEL: warning: test_target
 // GFX8-NOT: error:
-// GFX10: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1010:xnack+
-// GFX11: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-amdhsa--gfx1100
-// NONAMDHSA: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810:xnack+ does not match the specified target id amdgcn-amd-unknown--gfx810
+// GFX10: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810+xnack does not match the specified target id amdgcn-amd-amdhsa--gfx1010+xnack
+// GFX11: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810+xnack does not match the specified target id amdgcn-amd-amdhsa--gfx1100
+// NONAMDHSA: error: .amdgcn_target directive's target id amdgcn-amd-amdhsa--gfx810+xnack does not match the specified target id amdgcn-amd-unknown--gfx810
 .warning "test_target"
-.amdgcn_target "amdgcn-amd-amdhsa--gfx810:xnack+"
+.amdgcn_target "amdgcn-amd-amdhsa--gfx810+xnack"
 
 // GCN-LABEL: warning: test_amdhsa_kernel_no_name
 // GCN: error: unknown directive

diff  --git a/llvm/test/MC/AMDGPU/hsa-gfx10-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx10-v3.s
new file mode 100644
index 000000000000000..ba60000837cdc06
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-gfx10-v3.s
@@ -0,0 +1,226 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 --amdhsa-code-object-version=3 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 --amdhsa-code-object-version=3 -mattr=+xnack -filetype=obj < %s > %t
+// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// READOBJ: Section Headers
+// READOBJ: .text   PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256
+// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}}        0000c0 {{[0-9]+}}  A {{[0-9]+}} {{[0-9]+}} 64
+
+// READOBJ: Relocation section '.rela.rodata' at offset
+// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10
+// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110
+// READOBJ: 0000000000000090 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 210
+
+// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries:
+// READOBJ:      0000000000000000  0 FUNC    LOCAL  PROTECTED 2 minimal
+// READOBJ-NEXT: 0000000000000100  0 FUNC    LOCAL  PROTECTED 2 complete
+// READOBJ-NEXT: 0000000000000200  0 FUNC    LOCAL  PROTECTED 2 special_sgpr
+// READOBJ-NEXT: 0000000000000000 64 OBJECT  LOCAL  DEFAULT   3 minimal.kd
+// READOBJ-NEXT: 0000000000000040 64 OBJECT  LOCAL  DEFAULT   3 complete.kd
+// READOBJ-NEXT: 0000000000000080 64 OBJECT  LOCAL  DEFAULT   3 special_sgpr.kd
+
+// OBJDUMP: Contents of section .rodata
+// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here.
+// minimal
+// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 0000ac60 80000000 00000000 00000000
+// complete
+// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0070 015001e4 1f0f007f 7f040000 00000000
+// special_sgpr
+// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00b0 00000060 80000000 00000000 00000000
+
+.text
+// ASM: .text
+
+.amdgcn_target "amdgcn-amd-amdhsa--gfx1010+xnack"
+// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010+xnack"
+
+.p2align 8
+.type minimal, at function
+minimal:
+  s_endpgm
+
+.p2align 8
+.type complete, at function
+complete:
+  s_endpgm
+
+.p2align 8
+.type special_sgpr, at function
+special_sgpr:
+  s_endpgm
+
+.rodata
+// ASM: .rodata
+
+// Test that only specifying required directives is allowed, and that defaulted
+// values are omitted.
+.p2align 6
+.amdhsa_kernel minimal
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_shared_vgpr_count 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel minimal
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM: .amdhsa_shared_vgpr_count 0
+// ASM: .end_amdhsa_kernel
+
+// Test that we can specify all available directives with non-default values.
+.p2align 6
+.amdhsa_kernel complete
+  .amdhsa_group_segment_fixed_size 1
+  .amdhsa_private_segment_fixed_size 1
+  .amdhsa_kernarg_size 8
+  .amdhsa_user_sgpr_private_segment_buffer 1
+  .amdhsa_user_sgpr_dispatch_ptr 1
+  .amdhsa_user_sgpr_queue_ptr 1
+  .amdhsa_user_sgpr_kernarg_segment_ptr 1
+  .amdhsa_user_sgpr_dispatch_id 1
+  .amdhsa_user_sgpr_flat_scratch_init 1
+  .amdhsa_user_sgpr_private_segment_size 1
+  .amdhsa_wavefront_size32 1
+  .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+  .amdhsa_system_sgpr_workgroup_id_x 0
+  .amdhsa_system_sgpr_workgroup_id_y 1
+  .amdhsa_system_sgpr_workgroup_id_z 1
+  .amdhsa_system_sgpr_workgroup_info 1
+  .amdhsa_system_vgpr_workitem_id 1
+  .amdhsa_next_free_vgpr 9
+  .amdhsa_next_free_sgpr 27
+  .amdhsa_reserve_vcc 0
+  .amdhsa_reserve_flat_scratch 0
+  .amdhsa_reserve_xnack_mask 1
+  .amdhsa_float_round_mode_32 1
+  .amdhsa_float_round_mode_16_64 1
+  .amdhsa_float_denorm_mode_32 1
+  .amdhsa_float_denorm_mode_16_64 0
+  .amdhsa_dx10_clamp 0
+  .amdhsa_ieee_mode 0
+  .amdhsa_fp16_overflow 1
+  .amdhsa_workgroup_processor_mode 1
+  .amdhsa_memory_ordered 1
+  .amdhsa_forward_progress 1
+  .amdhsa_exception_fp_ieee_invalid_op 1
+  .amdhsa_exception_fp_denorm_src 1
+  .amdhsa_exception_fp_ieee_div_zero 1
+  .amdhsa_exception_fp_ieee_overflow 1
+  .amdhsa_exception_fp_ieee_underflow 1
+  .amdhsa_exception_fp_ieee_inexact 1
+  .amdhsa_exception_int_div_zero 1
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel complete
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_kernarg_size 8
+// ASM-NEXT: .amdhsa_user_sgpr_count 15
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1
+// ASM-NEXT: .amdhsa_wavefront_size32 1
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1
+// ASM-NEXT: .amdhsa_next_free_vgpr 9
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_reserve_flat_scratch 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 1
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_workgroup_processor_mode 1
+// ASM-NEXT: .amdhsa_memory_ordered 1
+// ASM-NEXT: .amdhsa_forward_progress 1
+// ASM-NEXT: .amdhsa_shared_vgpr_count 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
+
+// Test that we are including special SGPR usage in the granulated count.
+.p2align 6
+.amdhsa_kernel special_sgpr
+  // Same next_free_sgpr as "complete", but...
+  .amdhsa_next_free_sgpr 27
+  // ...on GFX10+ this should require an additional 6 SGPRs, pushing us from
+  // 3 granules to 4
+  .amdhsa_reserve_flat_scratch 1
+
+  .amdhsa_reserve_vcc 0
+  .amdhsa_reserve_xnack_mask 1
+
+  .amdhsa_float_denorm_mode_16_64 0
+  .amdhsa_dx10_clamp 0
+  .amdhsa_ieee_mode 0
+  .amdhsa_next_free_vgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel special_sgpr
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM: .end_amdhsa_kernel
+
+.section .foo
+
+.byte .amdgcn.gfx_generation_number
+// ASM: .byte 10
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v7, s10
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 8
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 11
+
+.set .amdgcn.next_free_vgpr, 0
+.set .amdgcn.next_free_sgpr, 0
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v16, s3
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 17
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 4

diff  --git a/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s
new file mode 100644
index 000000000000000..7f885b457aa63a3
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-gfx11-v3.s
@@ -0,0 +1,213 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1100 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1100 --amdhsa-code-object-version=3 -filetype=obj < %s > %t
+// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// READOBJ: Section Headers
+// READOBJ: .text   PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256
+// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}}        0000c0 {{[0-9]+}}  A {{[0-9]+}} {{[0-9]+}} 64
+
+// READOBJ: Relocation section '.rela.rodata' at offset
+// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10
+// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110
+// READOBJ: 0000000000000090 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 210
+
+// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries:
+// READOBJ:      0000000000000000  0 FUNC    LOCAL  PROTECTED 2 minimal
+// READOBJ-NEXT: 0000000000000100  0 FUNC    LOCAL  PROTECTED 2 complete
+// READOBJ-NEXT: 0000000000000200  0 FUNC    LOCAL  PROTECTED 2 special_sgpr
+// READOBJ-NEXT: 0000000000000000 64 OBJECT  LOCAL  DEFAULT   3 minimal.kd
+// READOBJ-NEXT: 0000000000000040 64 OBJECT  LOCAL  DEFAULT   3 complete.kd
+// READOBJ-NEXT: 0000000000000080 64 OBJECT  LOCAL  DEFAULT   3 special_sgpr.kd
+
+// OBJDUMP: Contents of section .rodata
+// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here.
+// minimal
+// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 0000ac60 80000000 00000000 00000000
+// complete
+// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0070 015001e4 130f007f 5e040000 00000000
+// special_sgpr
+// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00b0 00000060 80000000 00000000 00000000
+
+.text
+// ASM: .text
+
+.amdgcn_target "amdgcn-amd-amdhsa--gfx1100"
+// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx1100"
+
+.p2align 8
+.type minimal, at function
+minimal:
+  s_endpgm
+
+.p2align 8
+.type complete, at function
+complete:
+  s_endpgm
+
+.p2align 8
+.type special_sgpr, at function
+special_sgpr:
+  s_endpgm
+
+.rodata
+// ASM: .rodata
+
+// Test that only specifying required directives is allowed, and that defaulted
+// values are omitted.
+.p2align 6
+.amdhsa_kernel minimal
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel minimal
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM: .end_amdhsa_kernel
+
+// Test that we can specify all available directives with non-default values.
+.p2align 6
+.amdhsa_kernel complete
+  .amdhsa_group_segment_fixed_size 1
+  .amdhsa_private_segment_fixed_size 1
+  .amdhsa_kernarg_size 8
+  .amdhsa_user_sgpr_dispatch_ptr 1
+  .amdhsa_user_sgpr_queue_ptr 1
+  .amdhsa_user_sgpr_kernarg_segment_ptr 1
+  .amdhsa_user_sgpr_dispatch_id 1
+  .amdhsa_user_sgpr_private_segment_size 1
+  .amdhsa_wavefront_size32 1
+  .amdhsa_enable_private_segment 1
+  .amdhsa_system_sgpr_workgroup_id_x 0
+  .amdhsa_system_sgpr_workgroup_id_y 1
+  .amdhsa_system_sgpr_workgroup_id_z 1
+  .amdhsa_system_sgpr_workgroup_info 1
+  .amdhsa_system_vgpr_workitem_id 1
+  .amdhsa_next_free_vgpr 9
+  .amdhsa_next_free_sgpr 27
+  .amdhsa_reserve_vcc 0
+  .amdhsa_float_round_mode_32 1
+  .amdhsa_float_round_mode_16_64 1
+  .amdhsa_float_denorm_mode_32 1
+  .amdhsa_float_denorm_mode_16_64 0
+  .amdhsa_dx10_clamp 0
+  .amdhsa_ieee_mode 0
+  .amdhsa_fp16_overflow 1
+  .amdhsa_workgroup_processor_mode 1
+  .amdhsa_memory_ordered 1
+  .amdhsa_forward_progress 1
+  .amdhsa_exception_fp_ieee_invalid_op 1
+  .amdhsa_exception_fp_denorm_src 1
+  .amdhsa_exception_fp_ieee_div_zero 1
+  .amdhsa_exception_fp_ieee_overflow 1
+  .amdhsa_exception_fp_ieee_underflow 1
+  .amdhsa_exception_fp_ieee_inexact 1
+  .amdhsa_exception_int_div_zero 1
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel complete
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_kernarg_size 8
+// ASM-NEXT: .amdhsa_user_sgpr_count 9
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1
+// ASM-NEXT: .amdhsa_wavefront_size32 1
+// ASM-NEXT: .amdhsa_enable_private_segment 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1
+// ASM-NEXT: .amdhsa_next_free_vgpr 9
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_float_round_mode_32 1
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_workgroup_processor_mode 1
+// ASM-NEXT: .amdhsa_memory_ordered 1
+// ASM-NEXT: .amdhsa_forward_progress 1
+// ASM-NEXT: .amdhsa_shared_vgpr_count 0
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
+
+// Test that we are including special SGPR usage in the granulated count.
+.p2align 6
+.amdhsa_kernel special_sgpr
+  // Same next_free_sgpr as "complete", but...
+  .amdhsa_next_free_sgpr 27
+  // ...on GFX10+ this should require an additional 6 SGPRs, pushing us from
+  // 3 granules to 4
+
+  .amdhsa_reserve_vcc 0
+
+  .amdhsa_float_denorm_mode_16_64 0
+  .amdhsa_dx10_clamp 0
+  .amdhsa_ieee_mode 0
+  .amdhsa_next_free_vgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel special_sgpr
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM: .end_amdhsa_kernel
+
+.section .foo
+
+.byte .amdgcn.gfx_generation_number
+// ASM: .byte 11
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v7, s10
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 8
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 11
+
+.set .amdgcn.next_free_vgpr, 0
+.set .amdgcn.next_free_sgpr, 0
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v16, s3
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 17
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 4

diff  --git a/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s
new file mode 100644
index 000000000000000..fd84fab8af81685
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s
@@ -0,0 +1,184 @@
+// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=obj < %s > %t
+// RUN: llvm-readobj --elf-output-style=GNU --sections --symbols --relocations %t | FileCheck --check-prefix=READOBJ %s
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// READOBJ: Section Headers
+// READOBJ: .text   PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256
+// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}}        000080 {{[0-9]+}}  A {{[0-9]+}} {{[0-9]+}} 64
+
+// READOBJ: Relocation section '.rela.rodata' at offset
+// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10
+// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110
+
+// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries:
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000100  0 FUNC    LOCAL  PROTECTED 2 complete
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000040 64 OBJECT  LOCAL  DEFAULT   3 complete.kd
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000000  0 FUNC    LOCAL  PROTECTED 2 minimal
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000000 64 OBJECT  LOCAL  DEFAULT   3 minimal.kd
+
+// OBJDUMP: Contents of section .rodata
+// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here.
+// minimal
+// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000
+// complete
+// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100
+// OBJDUMP-NEXT: 0070 c1500104 210f007f 7f008100 00000000
+
+.text
+// ASM: .text
+
+.amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc"
+// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc"
+
+.p2align 8
+.type minimal, at function
+minimal:
+  s_endpgm
+
+.p2align 8
+.type complete, at function
+complete:
+  s_endpgm
+
+.rodata
+// ASM: .rodata
+
+// Test that only specifying required directives is allowed, and that defaulted
+// values are omitted.
+.p2align 6
+.amdhsa_kernel minimal
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel minimal
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_accum_offset 4
+// ASM: .amdhsa_tg_split 0
+// ASM: .end_amdhsa_kernel
+
+// Test that we can specify all available directives with non-default values.
+.p2align 6
+.amdhsa_kernel complete
+  .amdhsa_group_segment_fixed_size 1
+  .amdhsa_private_segment_fixed_size 1
+  .amdhsa_user_sgpr_private_segment_buffer 1
+  .amdhsa_user_sgpr_dispatch_ptr 1
+  .amdhsa_user_sgpr_queue_ptr 1
+  .amdhsa_user_sgpr_kernarg_segment_ptr 1
+  .amdhsa_user_sgpr_dispatch_id 1
+  .amdhsa_user_sgpr_flat_scratch_init 1
+  .amdhsa_kernarg_size 8
+  .amdhsa_user_sgpr_kernarg_preload_length  1
+  .amdhsa_user_sgpr_kernarg_preload_offset  1
+  .amdhsa_user_sgpr_private_segment_size 1
+  .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+  .amdhsa_system_sgpr_workgroup_id_x 0
+  .amdhsa_system_sgpr_workgroup_id_y 1
+  .amdhsa_system_sgpr_workgroup_id_z 1
+  .amdhsa_system_sgpr_workgroup_info 1
+  .amdhsa_system_vgpr_workitem_id 1
+  .amdhsa_next_free_vgpr 9
+  .amdhsa_next_free_sgpr 27
+  .amdhsa_accum_offset 4
+  .amdhsa_reserve_vcc 0
+  .amdhsa_reserve_flat_scratch 0
+  .amdhsa_float_round_mode_32 1
+  .amdhsa_float_round_mode_16_64 1
+  .amdhsa_float_denorm_mode_32 1
+  .amdhsa_float_denorm_mode_16_64 0
+  .amdhsa_dx10_clamp 0
+  .amdhsa_ieee_mode 0
+  .amdhsa_fp16_overflow 1
+  .amdhsa_tg_split 1
+  .amdhsa_exception_fp_ieee_invalid_op 1
+  .amdhsa_exception_fp_denorm_src 1
+  .amdhsa_exception_fp_ieee_div_zero 1
+  .amdhsa_exception_fp_ieee_overflow 1
+  .amdhsa_exception_fp_ieee_underflow 1
+  .amdhsa_exception_fp_ieee_inexact 1
+  .amdhsa_exception_int_div_zero 1
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel complete
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_kernarg_size 8
+// ASM-NEXT: .amdhsa_user_sgpr_count 16
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length  1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset  1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1
+// ASM-NEXT: .amdhsa_next_free_vgpr 9
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_accum_offset 4
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_reserve_flat_scratch 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 1
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_tg_split 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
+
+.section .foo
+
+.byte .amdgcn.gfx_generation_number
+// ASM: .byte 9
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v7, s10
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 8
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 11
+
+.set .amdgcn.next_free_vgpr, 0
+.set .amdgcn.next_free_sgpr, 0
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v16, s3
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 17
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 4

diff  --git a/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s b/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s
new file mode 100644
index 000000000000000..9624515ecd6fb90
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-gfx940-v3.s
@@ -0,0 +1,178 @@
+// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj < %s > %t
+// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// READOBJ: Section Headers
+// READOBJ: .text   PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256
+// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}}        000080 {{[0-9]+}}  A {{[0-9]+}} {{[0-9]+}} 64
+
+// READOBJ: Relocation section '.rela.rodata' at offset
+// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10
+// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110
+
+// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries:
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000100  0 FUNC    LOCAL  PROTECTED 2 complete
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000040 64 OBJECT  LOCAL  DEFAULT   3 complete.kd
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000000  0 FUNC    LOCAL  PROTECTED 2 minimal
+// READOBJ-DAG: {{[0-9]+}}: 0000000000000000 64 OBJECT  LOCAL  DEFAULT   3 minimal.kd
+
+// OBJDUMP: Contents of section .rodata
+// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here.
+// minimal
+// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000
+// complete
+// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100
+// OBJDUMP-NEXT: 0070 01510104 150f007f 5e008100 00000000
+
+.text
+// ASM: .text
+
+.amdgcn_target "amdgcn-amd-amdhsa--gfx940+xnack+sram-ecc"
+// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx940+xnack+sram-ecc"
+
+.p2align 8
+.type minimal, at function
+minimal:
+  s_endpgm
+
+.p2align 8
+.type complete, at function
+complete:
+  s_endpgm
+
+.rodata
+// ASM: .rodata
+
+// Test that only specifying required directives is allowed, and that defaulted
+// values are omitted.
+.p2align 6
+.amdhsa_kernel minimal
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+  .amdhsa_accum_offset 4
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel minimal
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM-NEXT: .amdhsa_accum_offset 4
+// ASM: .amdhsa_tg_split 0
+// ASM: .end_amdhsa_kernel
+
+// Test that we can specify all available directives with non-default values.
+.p2align 6
+.amdhsa_kernel complete
+  .amdhsa_group_segment_fixed_size 1
+  .amdhsa_private_segment_fixed_size 1
+  .amdhsa_user_sgpr_dispatch_ptr 1
+  .amdhsa_user_sgpr_queue_ptr 1
+  .amdhsa_user_sgpr_kernarg_segment_ptr 1
+  .amdhsa_user_sgpr_dispatch_id 1
+  .amdhsa_kernarg_size 8
+  .amdhsa_user_sgpr_kernarg_preload_length  1
+  .amdhsa_user_sgpr_kernarg_preload_offset  1
+  .amdhsa_user_sgpr_private_segment_size 1
+  .amdhsa_enable_private_segment 1
+  .amdhsa_system_sgpr_workgroup_id_x 0
+  .amdhsa_system_sgpr_workgroup_id_y 1
+  .amdhsa_system_sgpr_workgroup_id_z 1
+  .amdhsa_system_sgpr_workgroup_info 1
+  .amdhsa_system_vgpr_workitem_id 1
+  .amdhsa_next_free_vgpr 9
+  .amdhsa_next_free_sgpr 27
+  .amdhsa_accum_offset 4
+  .amdhsa_reserve_vcc 0
+  .amdhsa_float_round_mode_32 1
+  .amdhsa_float_round_mode_16_64 1
+  .amdhsa_float_denorm_mode_32 1
+  .amdhsa_float_denorm_mode_16_64 0
+  .amdhsa_dx10_clamp 0
+  .amdhsa_ieee_mode 0
+  .amdhsa_fp16_overflow 1
+  .amdhsa_tg_split 1
+  .amdhsa_exception_fp_ieee_invalid_op 1
+  .amdhsa_exception_fp_denorm_src 1
+  .amdhsa_exception_fp_ieee_div_zero 1
+  .amdhsa_exception_fp_ieee_overflow 1
+  .amdhsa_exception_fp_ieee_underflow 1
+  .amdhsa_exception_fp_ieee_inexact 1
+  .amdhsa_exception_int_div_zero 1
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel complete
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_kernarg_size 8
+// ASM-NEXT: .amdhsa_user_sgpr_count 10
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length  1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset  1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1
+// ASM-NEXT: .amdhsa_enable_private_segment 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1
+// ASM-NEXT: .amdhsa_next_free_vgpr 9
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_accum_offset 4
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 1
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_tg_split 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
+
+.section .foo
+
+.byte .amdgcn.gfx_generation_number
+// ASM: .byte 9
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v7, s10
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 8
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 11
+
+.set .amdgcn.next_free_vgpr, 0
+.set .amdgcn.next_free_sgpr, 0
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v16, s3
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 17
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 4

diff  --git a/llvm/test/MC/AMDGPU/hsa-v3.s b/llvm/test/MC/AMDGPU/hsa-v3.s
new file mode 100644
index 000000000000000..9f854986d7bc447
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/hsa-v3.s
@@ -0,0 +1,304 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=3 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=3 -mattr=+xnack -filetype=obj < %s > %t
+// RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s
+// RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
+
+// READOBJ: Section Headers
+// READOBJ: .text   PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256
+// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}}        000100 {{[0-9]+}}  A {{[0-9]+}} {{[0-9]+}} 64
+
+// READOBJ: Relocation section '.rela.rodata' at offset
+// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10
+// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110
+// READOBJ: 0000000000000090 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 210
+// READOBJ: 00000000000000d0 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 310
+
+// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries:
+// READOBJ:      0000000000000000  0 FUNC    LOCAL  PROTECTED 2 minimal
+// READOBJ-NEXT: 0000000000000100  0 FUNC    LOCAL  PROTECTED 2 complete
+// READOBJ-NEXT: 0000000000000200  0 FUNC    LOCAL  PROTECTED 2 special_sgpr
+// READOBJ-NEXT: 0000000000000300  0 FUNC    LOCAL  PROTECTED 2 disabled_user_sgpr
+// READOBJ-NEXT: 0000000000000000 64 OBJECT  LOCAL  DEFAULT   3 minimal.kd
+// READOBJ-NEXT: 0000000000000040 64 OBJECT  LOCAL  DEFAULT   3 complete.kd
+// READOBJ-NEXT: 0000000000000080 64 OBJECT  LOCAL  DEFAULT   3 special_sgpr.kd
+// READOBJ-NEXT: 00000000000000c0 64 OBJECT  LOCAL  DEFAULT   3 disabled_user_sgpr.kd
+
+// OBJDUMP: Contents of section .rodata
+// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here.
+// minimal
+// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000
+// complete
+// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
+// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0070 c2500104 1f0f007f 7f000000 00000000
+// special_sgpr
+// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00b0 00010000 80000000 00000000 00000000
+// disabled_user_sgpr
+// OBJDUMP-NEXT: 00c0 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000
+// OBJDUMP-NEXT: 00f0 0000ac00 80000000 00000000 00000000
+
+.text
+// ASM: .text
+
+.amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack"
+// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack"
+
+.p2align 8
+.type minimal, at function
+minimal:
+  s_endpgm
+
+.p2align 8
+.type complete, at function
+complete:
+  s_endpgm
+
+.p2align 8
+.type special_sgpr, at function
+special_sgpr:
+  s_endpgm
+
+.p2align 8
+.type disabled_user_sgpr, at function
+disabled_user_sgpr:
+  s_endpgm
+
+.rodata
+// ASM: .rodata
+
+// Test that only specifying required directives is allowed, and that defaulted
+// values are omitted.
+.p2align 6
+.amdhsa_kernel minimal
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel minimal
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM: .end_amdhsa_kernel
+
+// Test that we can specify all available directives with non-default values.
+.p2align 6
+.amdhsa_kernel complete
+  .amdhsa_group_segment_fixed_size 1
+  .amdhsa_private_segment_fixed_size 1
+  .amdhsa_kernarg_size 8
+  .amdhsa_user_sgpr_private_segment_buffer 1
+  .amdhsa_user_sgpr_dispatch_ptr 1
+  .amdhsa_user_sgpr_queue_ptr 1
+  .amdhsa_user_sgpr_kernarg_segment_ptr 1
+  .amdhsa_user_sgpr_dispatch_id 1
+  .amdhsa_user_sgpr_flat_scratch_init 1
+  .amdhsa_user_sgpr_private_segment_size 1
+  .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+  .amdhsa_system_sgpr_workgroup_id_x 0
+  .amdhsa_system_sgpr_workgroup_id_y 1
+  .amdhsa_system_sgpr_workgroup_id_z 1
+  .amdhsa_system_sgpr_workgroup_info 1
+  .amdhsa_system_vgpr_workitem_id 1
+  .amdhsa_next_free_vgpr 9
+  .amdhsa_next_free_sgpr 27
+  .amdhsa_reserve_vcc 0
+  .amdhsa_reserve_flat_scratch 0
+  .amdhsa_reserve_xnack_mask 1
+  .amdhsa_float_round_mode_32 1
+  .amdhsa_float_round_mode_16_64 1
+  .amdhsa_float_denorm_mode_32 1
+  .amdhsa_float_denorm_mode_16_64 0
+  .amdhsa_dx10_clamp 0
+  .amdhsa_ieee_mode 0
+  .amdhsa_fp16_overflow 1
+  .amdhsa_exception_fp_ieee_invalid_op 1
+  .amdhsa_exception_fp_denorm_src 1
+  .amdhsa_exception_fp_ieee_div_zero 1
+  .amdhsa_exception_fp_ieee_overflow 1
+  .amdhsa_exception_fp_ieee_underflow 1
+  .amdhsa_exception_fp_ieee_inexact 1
+  .amdhsa_exception_int_div_zero 1
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel complete
+// ASM-NEXT: .amdhsa_group_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_private_segment_fixed_size 1
+// ASM-NEXT: .amdhsa_kernarg_size 8
+// ASM-NEXT: .amdhsa_user_sgpr_count 15
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
+// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1
+// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
+// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1
+// ASM-NEXT: .amdhsa_next_free_vgpr 9
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_reserve_flat_scratch 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM-NEXT: .amdhsa_float_round_mode_32 1
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 1
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM-NEXT: .amdhsa_fp16_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1
+// ASM-NEXT: .amdhsa_exception_int_div_zero 1
+// ASM-NEXT: .end_amdhsa_kernel
+
+// Test that we are including special SGPR usage in the granulated count.
+.p2align 6
+.amdhsa_kernel special_sgpr
+  // Same next_free_sgpr as "complete", but...
+  .amdhsa_next_free_sgpr 27
+  // ...on GFX9 this should require an additional 6 SGPRs, pushing us from
+  // 3 granules to 4
+  .amdhsa_reserve_flat_scratch 1
+
+  .amdhsa_reserve_vcc 0
+  .amdhsa_reserve_xnack_mask 1
+
+  .amdhsa_float_denorm_mode_16_64 0
+  .amdhsa_dx10_clamp 0
+  .amdhsa_ieee_mode 0
+  .amdhsa_next_free_vgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel special_sgpr
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 27
+// ASM-NEXT: .amdhsa_reserve_vcc 0
+// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
+// ASM: .amdhsa_float_denorm_mode_16_64 0
+// ASM-NEXT: .amdhsa_dx10_clamp 0
+// ASM-NEXT: .amdhsa_ieee_mode 0
+// ASM: .end_amdhsa_kernel
+
+// Test that explicitly disabling user_sgpr's does not affect the user_sgpr
+// count, i.e. this should produce the same descriptor as minimal.
+.p2align 6
+.amdhsa_kernel disabled_user_sgpr
+  .amdhsa_user_sgpr_private_segment_buffer 0
+  .amdhsa_next_free_vgpr 0
+  .amdhsa_next_free_sgpr 0
+.end_amdhsa_kernel
+
+// ASM: .amdhsa_kernel disabled_user_sgpr
+// ASM: .amdhsa_next_free_vgpr 0
+// ASM-NEXT: .amdhsa_next_free_sgpr 0
+// ASM: .end_amdhsa_kernel
+
+.section .foo
+
+.byte .amdgcn.gfx_generation_number
+// ASM: .byte 9
+
+.byte .amdgcn.gfx_generation_minor
+// ASM: .byte 0
+
+.byte .amdgcn.gfx_generation_stepping
+// ASM: .byte 4
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v7, s10
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 8
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 11
+
+.set .amdgcn.next_free_vgpr, 0
+.set .amdgcn.next_free_sgpr, 0
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 0
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 0
+
+v_mov_b32_e32 v16, s3
+
+.byte .amdgcn.next_free_vgpr
+// ASM: .byte 17
+.byte .amdgcn.next_free_sgpr
+// ASM: .byte 4
+
+// Metadata
+
+.amdgpu_metadata
+  amdhsa.version:
+    - 3
+    - 0
+  amdhsa.kernels:
+    - .name:       amd_kernel_code_t_test_all
+      .symbol: amd_kernel_code_t_test_all at kd
+      .kernarg_segment_size: 8
+      .group_segment_fixed_size: 16
+      .private_segment_fixed_size: 32
+      .kernarg_segment_align: 64
+      .wavefront_size: 128
+      .sgpr_count: 14
+      .vgpr_count: 40
+      .max_flat_workgroup_size: 256
+    - .name:       amd_kernel_code_t_minimal
+      .symbol: amd_kernel_code_t_minimal at kd
+      .kernarg_segment_size: 8
+      .group_segment_fixed_size: 16
+      .private_segment_fixed_size: 32
+      .kernarg_segment_align: 64
+      .wavefront_size: 128
+      .sgpr_count: 14
+      .vgpr_count: 40
+      .max_flat_workgroup_size: 256
+.end_amdgpu_metadata
+
+// ASM:      	.amdgpu_metadata
+// ASM:      amdhsa.kernels:
+// ASM:        - .group_segment_fixed_size: 16
+// ASM:          .kernarg_segment_align: 64
+// ASM:          .kernarg_segment_size: 8
+// ASM:          .max_flat_workgroup_size: 256
+// ASM:          .name:           amd_kernel_code_t_test_all
+// ASM:          .private_segment_fixed_size: 32
+// ASM:          .sgpr_count:     14
+// ASM:          .symbol:         'amd_kernel_code_t_test_all at kd'
+// ASM:          .vgpr_count:     40
+// ASM:          .wavefront_size: 128
+// ASM:        - .group_segment_fixed_size: 16
+// ASM:          .kernarg_segment_align: 64
+// ASM:          .kernarg_segment_size: 8
+// ASM:          .max_flat_workgroup_size: 256
+// ASM:          .name:           amd_kernel_code_t_minimal
+// ASM:          .private_segment_fixed_size: 32
+// ASM:          .sgpr_count:     14
+// ASM:          .symbol:         'amd_kernel_code_t_minimal at kd'
+// ASM:          .vgpr_count:     40
+// ASM:          .wavefront_size: 128
+// ASM:      amdhsa.version:
+// ASM-NEXT:   - 3
+// ASM-NEXT:   - 0
+// ASM:      	.end_amdgpu_metadata

diff  --git a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s
index 7e3ae8424cc7bdc..63e532e0ffa3768 100644
--- a/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s
+++ b/llvm/test/MC/AMDGPU/user-sgpr-count-diag.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc --amdhsa-code-object-version=4 -triple amdgcn-amd-amdhsa -mcpu=gfx90a %s 2>&1 >/dev/null | FileCheck -check-prefix=ERR %s
+// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a %s 2>&1 >/dev/null | FileCheck -check-prefix=ERR %s
 
 .amdhsa_kernel implied_count_too_low_0
   .amdhsa_user_sgpr_count 0

diff  --git a/llvm/test/MC/AMDGPU/user-sgpr-count.s b/llvm/test/MC/AMDGPU/user-sgpr-count.s
index 950c514f786b254..aa8970185eb04bc 100644
--- a/llvm/test/MC/AMDGPU/user-sgpr-count.s
+++ b/llvm/test/MC/AMDGPU/user-sgpr-count.s
@@ -1,10 +1,10 @@
-// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a --amdhsa-code-object-version=4 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a --amdhsa-code-object-version=3 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
 
 .text
 // ASM: .text
 
-.amdgcn_target "amdgcn-amd-amdhsa--gfx90a:xnack+"
-// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a:xnack+"
+.amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc"
+// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack+sram-ecc"
 
 
 // ASM-LABEL: .amdhsa_kernel user_sgprs_implied_count


        


More information about the llvm-commits mailing list