[llvm] 7470244 - [AMDGPU] Add agpr_count to metadata and AsmParser

Wed Feb 16 15:17:56 PST 2022

Author: Jacob Lambert
Date: 2022-02-16T15:17:23-08:00
New Revision: 7470244475f85e9c8d7c46da87780935860d8026

URL: https://github.com/llvm/llvm-project/commit/7470244475f85e9c8d7c46da87780935860d8026
DIFF: https://github.com/llvm/llvm-project/commit/7470244475f85e9c8d7c46da87780935860d8026.diff

LOG: [AMDGPU] Add agpr_count to metadata and AsmParser

gfx90a allows the number of ACC registers (AGPRs) to be set
independently to the VGPR registers. For both HSA and PAL metadata, we
now include an "agpr_count" key to report the number of AGPRs set for
supported devices (gfx90a, gfx908, as determined by hasMAIInsts()).
This is collected from SIProgramInfo.NumAccVGPR for both HSA and PAL.
The AsmParser also now recognizes ".kernel.agpr_count" for supported
devices.

Differential Revision: https://reviews.llvm.org/D116140

Added: 
    llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-small.ll
    llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s

Modified: 
    llvm/docs/AMDGPUUsage.rst
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
    llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
    llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h

Removed: 
    


################################################################################
diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 4233edd899eea..5fae9129bc6a5 100644

--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -3185,6 +3185,10 @@ same *vendor-name*.
                                                                   if a higher numbered
                                                                   register is used
                                                                   explicitly.
+     ".agpr_count"                       integer        Required  Number of accumulator
+                                                                  registers required by
+                                                                  each work-item for
+                                                                  GFX90A, GFX908.
      ".max_flat_workgroup_size"          integer        Required  Maximum flat
                                                                   work-group size
                                                                   supported by the
@@ -11431,6 +11435,7 @@ within a map that has been added by the same *vendor-name*.
      ".lds_size"                integer                  Local Data Share size in bytes.
      ".perf_data_buffer_size"   integer                  Performance data buffer size in bytes.
      ".vgpr_count"              integer                  Number of VGPRs used.
+     ".agpr_count"              integer                  Number of AGPRs used.
      ".sgpr_count"              integer                  Number of SGPRs used.
      ".vgpr_limit"              integer                  If non-zero, indicates the shader was compiled with a
                                                          directive to instruct the compiler to limit the VGPR usage to

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 26c06d2155ca4..b7a16fde574bd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1001,6 +1001,13 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
 
   MD->setEntryPoint(CC, MF.getFunction().getName());
   MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+
+  // Only set AGPRs for supported devices
+  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+  if (STM.hasMAIInsts()) {
+    MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
+  }
+
   MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
   MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
   if (AMDGPU::isCompute(CC)) {
@@ -1017,7 +1024,6 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
     MD->setSpiPsInputAddr(MFI->getPSInputAddr());
   }
 
-  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   if (STM.isWave32())
     MD->setWave32(MF.getFunction().getCallingConv());
 }

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 8cc5c1345b0f3..e1e3e6621ee58 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -877,6 +877,12 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
       Kern.getDocument()->getNode(STM.getWavefrontSize());
   Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR);
   Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR);
+
+  // Only add AGPR count to metadata for supported devices
+  if (STM.hasMAIInsts()) {
+    Kern[".agpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumAccVGPR);
+  }
+
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
   Kern[".sgpr_spill_count"] =

diff  --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index d20b4918bf09a..d348a4c7e9091 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1125,7 +1125,9 @@ raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) {
 class KernelScopeInfo {
   int SgprIndexUnusedMin = -1;
   int VgprIndexUnusedMin = -1;
+  int AgprIndexUnusedMin = -1;
   MCContext *Ctx = nullptr;
+  MCSubtargetInfo const *MSTI = nullptr;
 
   void usesSgprAt(int i) {
     if (i >= SgprIndexUnusedMin) {
@@ -1144,7 +1146,31 @@ class KernelScopeInfo {
       if (Ctx) {
         MCSymbol* const Sym =
           Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count"));
-        Sym->setVariableValue(MCConstantExpr::create(VgprIndexUnusedMin, *Ctx));
+        int totalVGPR = getTotalNumVGPRs(isGFX90A(*MSTI), AgprIndexUnusedMin,
+                                         VgprIndexUnusedMin);
+        Sym->setVariableValue(MCConstantExpr::create(totalVGPR, *Ctx));
+      }
+    }
+  }
+
+  void usesAgprAt(int i) {
+    // Instruction will error in AMDGPUAsmParser::MatchAndEmitInstruction
+    if (!hasMAIInsts(*MSTI))
+      return;
+
+    if (i >= AgprIndexUnusedMin) {
+      AgprIndexUnusedMin = ++i;
+      if (Ctx) {
+        MCSymbol* const Sym =
+          Ctx->getOrCreateSymbol(Twine(".kernel.agpr_count"));
+        Sym->setVariableValue(MCConstantExpr::create(AgprIndexUnusedMin, *Ctx));
+
+        // Also update vgpr_count (dependent on agpr_count for gfx908/gfx90a)
+        MCSymbol* const vSym =
+          Ctx->getOrCreateSymbol(Twine(".kernel.vgpr_count"));
+        int totalVGPR = getTotalNumVGPRs(isGFX90A(*MSTI), AgprIndexUnusedMin,
+                                         VgprIndexUnusedMin);
+        vSym->setVariableValue(MCConstantExpr::create(totalVGPR, *Ctx));
       }
     }
   }
@@ -1154,14 +1180,19 @@ class KernelScopeInfo {
 
   void initialize(MCContext &Context) {
     Ctx = &Context;
+    MSTI = Ctx->getSubtargetInfo();
+
     usesSgprAt(SgprIndexUnusedMin = -1);
     usesVgprAt(VgprIndexUnusedMin = -1);
+    if (hasMAIInsts(*MSTI)) {
+      usesAgprAt(AgprIndexUnusedMin = -1);
+    }
   }
 
   void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) {
     switch (RegKind) {
       case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break;
-      case IS_AGPR: // fall through
+      case IS_AGPR: usesAgprAt(DwordRegIndex + RegWidth - 1); break;
       case IS_VGPR: usesVgprAt(DwordRegIndex + RegWidth - 1); break;
       default: break;
     }

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 66c99fea052dc..fe345723e12ea 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1523,6 +1523,10 @@ bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
 }
 
+bool hasMAIInsts(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureMAIInsts];
+}
+
 int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR,
                          int32_t ArgNumVGPR) {
   if (has90AInsts && ArgNumAGPR)

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 2086684e1255d..7df0eab964e62 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -775,6 +775,7 @@ bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
 bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
 bool isGFX90A(const MCSubtargetInfo &STI);
 bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
+bool hasMAIInsts(const MCSubtargetInfo &STI);
 int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
 
 /// Is Reg - scalar register

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index f6b5975f19347..4ad93f7b0b682 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -209,6 +209,11 @@ void AMDGPUPALMetadata::setNumUsedVgprs(CallingConv::ID CC, unsigned Val) {
   getHwStage(CC)[".vgpr_count"] = MsgPackDoc.getNode(Val);
 }
 
+// Set the number of used agprs in the metadata.
+void AMDGPUPALMetadata::setNumUsedAgprs(CallingConv::ID CC, unsigned Val) {
+  getHwStage(CC)[".agpr_count"] = Val;
+}
+
 // Set the number of used sgprs in the metadata. This is an optional advisory
 // record for logging etc; wave dispatch actually uses the rsrc1 register for
 // the shader stage to determine the number of sgprs to allocate.

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 7fdd9a8429c15..a45a799e38a9b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -69,6 +69,10 @@ class AMDGPUPALMetadata {
   // the shader stage to determine the number of vgprs to allocate.
   void setNumUsedVgprs(unsigned CC, unsigned Val);
 
+  // Set the number of used agprs in the metadata. This is an optional advisory
+  // record for logging etc;
+  void setNumUsedAgprs(unsigned CC, unsigned Val);
+
   // Set the number of used sgprs in the metadata. This is an optional advisory
   // record for logging etc; wave dispatch actually uses the rsrc1 register for
   // the shader stage to determine the number of sgprs to allocate.

diff  --git a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
new file mode 100644
index 0000000000000..99a7ae37e0e78
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
@@ -0,0 +1,78 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx90a < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx908 < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
+
+; COM: Adapted from agpr-register-count.ll
+; COM: GFX900 and below should not have .agpr_count present in the metadata
+
+
+; CHECK:      .type          kernel_32_agprs
+; CHECK:      NumAgprs:       32
+define amdgpu_kernel void @kernel_32_agprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v8}" ()
+  call void asm sideeffect "", "~{a31}" ()
+  ret void
+}
+
+; CHECK:      .type          kernel_0_agprs
+; CHECK:      NumAgprs:       0
+define amdgpu_kernel void @kernel_0_agprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v0}" ()
+  ret void
+}
+
+; CHECK:      .type           kernel_40_vgprs
+; CHECK:      NumAgprs:       16
+define amdgpu_kernel void @kernel_40_vgprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v39}" ()
+  call void asm sideeffect "", "~{a15}" ()
+  ret void
+}
+
+; CHECK:      .type          kernel_max_gprs
+; CHECK:      NumAgprs:       256
+define amdgpu_kernel void @kernel_max_gprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v255}" ()
+  call void asm sideeffect "", "~{a255}" ()
+  ret void
+}
+
+; CHECK:      .type          func_32_agprs
+; CHECK:      NumAgprs:       32
+define void @func_32_agprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v8}" ()
+  call void asm sideeffect "", "~{a31}" ()
+  ret void
+}
+
+; CHECK:      .type          kernel_call_func_32_agprs
+; CHECK:      NumAgprs:       32
+define amdgpu_kernel void @kernel_call_func_32_agprs() #0 {
+bb:
+  call void @func_32_agprs() #0
+  ret void
+}
+
+declare void @undef_func()
+
+; CHECK:      .type          kernel_call_undef_func
+; CHECK:      NumAgprs:       32
+define amdgpu_kernel void @kernel_call_undef_func() #0 {
+bb:
+  call void @undef_func()
+  ret void
+}
+
+; CHECK: ---
+; CHECK:  amdpal.pipelines:
+; GFX90A: agpr_count:  0x20
+; GFX90A: vgpr_count:  0x40
+
+; GFX908: agpr_count:  0x20
+; GFX908: vgpr_count:  0x20
+
+attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" }

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll
new file mode 100644
index 0000000000000..b6eff8846dc8c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-register-count.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX801 %s
+
+; COM: Adapted from agpr-register-count.ll
+; COM: GFX900 and below should not have .agpr_count present in the metadata
+
+; CHECK: ---
+; CHECK:  amdhsa.kernels:
+
+; GFX90A:    - .agpr_count:    32
+; GFX908:    - .agpr_count:    32
+; GFX801-NOT:    - .agpr_count:
+; CHECK:      .name:          kernel_32_agprs
+; GFX90A:      .vgpr_count:    44
+; GFX908:      .vgpr_count:    32
+; GFX801:      .vgpr_count:    9
+define amdgpu_kernel void @kernel_32_agprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v8}" ()
+  call void asm sideeffect "", "~{a31}" ()
+  ret void
+}
+
+; GFX90A:    - .agpr_count:    0
+; GFX908:    - .agpr_count:    0
+; GFX801-NOT:    - .agpr_count:
+; CHECK:      .name:          kernel_0_agprs
+; GFX90A:      .vgpr_count:    1
+; GFX908:      .vgpr_count:    1
+; GFX801:      .vgpr_count:    1
+define amdgpu_kernel void @kernel_0_agprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v0}" ()
+  ret void
+}
+
+; GFX90A:    - .agpr_count:    16
+; GFX908:    - .agpr_count:    16
+; GFX801-NOT:    - .agpr_count:
+; CHECK:      .name:          kernel_40_vgprs
+; GFX90A:      .vgpr_count:    56
+; GFX908:      .vgpr_count:    40
+; GFX801:      .vgpr_count:    40
+define amdgpu_kernel void @kernel_40_vgprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v39}" ()
+  call void asm sideeffect "", "~{a15}" ()
+  ret void
+}
+
+; GFX90A:    - .agpr_count:    256
+; GFX908:    - .agpr_count:    256
+; GFX801-NOT:    - .agpr_count:
+; CHECK:      .name:          kernel_max_gprs
+; GFX90A:      .vgpr_count:    512
+; GFX908:      .vgpr_count:    256
+; GFX801:      .vgpr_count:    256
+define amdgpu_kernel void @kernel_max_gprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v255}" ()
+  call void asm sideeffect "", "~{a255}" ()
+  ret void
+}
+
+define void @func_32_agprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v8}" ()
+  call void asm sideeffect "", "~{a31}" ()
+  ret void
+}
+
+; GFX90A:    - .agpr_count:    32
+; GFX908:    - .agpr_count:    32
+; GFX801-NOT:    - .agpr_count:
+; CHECK:      .name:          kernel_call_func_32_agprs
+; GFX90A:      .vgpr_count:    44
+; GFX908:      .vgpr_count:    32
+; GFX801:      .vgpr_count:    9
+define amdgpu_kernel void @kernel_call_func_32_agprs() #0 {
+bb:
+  call void @func_32_agprs() #0
+  ret void
+}
+
+declare void @undef_func()
+
+; GFX90A:    - .agpr_count:    32
+; GFX908:    - .agpr_count:    32
+; GFX801-NOT:    - .agpr_count:
+; CHECK:      .name:          kernel_call_undef_func
+; GFX90A:      .vgpr_count:    64
+; GFX908:      .vgpr_count:    32
+; GFX801:      .vgpr_count:    32
+define amdgpu_kernel void @kernel_call_undef_func() #0 {
+bb:
+  call void @undef_func()
+  ret void
+}
+
+attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" }

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-small.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-small.ll
new file mode 100644
index 0000000000000..5ec1502899edf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-agpr-small.ll
@@ -0,0 +1,57 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=CHECK,GFX908 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=CHECK,GFX90A %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx801 < %s | FileCheck -check-prefixes=CHECK,GFX801 %s
+
+; COM: Comments for each kernel
+; CHECK: kernel_32_agprs
+; GFX908:   ; NumVgprs: 9
+; GFX908    ; NumAgprs: 32
+; GFX908    ; TotalNumVgprs: 32
+
+; GFX90A:   ; NumVgprs: 9
+; GFX90A    ; NumAgprs: 32
+; GFX90A    ; TotalNumVgprs: 44
+
+; GFX801:   ; NumVgprs: 9
+
+; CHECK: kernel_40_vgprs
+; GFX908:   ; NumVgprs: 40
+; GFX908    ; NumAgprs: 16
+; GFX908    ; TotalNumVgprs: 40
+
+; GFX90A:   ; NumVgprs: 40
+; GFX90A    ; NumAgprs: 16
+; GFX90A    ; TotalNumVgprs: 56
+
+; GFX801:   ; NumVgprs: 40
+
+; COM: Metadata
+; GFX908:    - .agpr_count:    32
+; GFX908:      .vgpr_count:    32
+
+; GFX90A:    - .agpr_count:    32
+; GFX90A:      .vgpr_count:    44
+
+; GFX801:      .vgpr_count:    9
+define amdgpu_kernel void @kernel_32_agprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v8}" ()
+  call void asm sideeffect "", "~{a31}" ()
+  ret void
+}
+
+; GFX908:    - .agpr_count:    16
+; GFX908:      .vgpr_count:    40
+
+; GFX90A:    - .agpr_count:    16
+; GFX90A:      .vgpr_count:    56
+
+; GFX801:      .vgpr_count:    40
+define amdgpu_kernel void @kernel_40_vgprs() #0 {
+bb:
+  call void asm sideeffect "", "~{v39}" ()
+  call void asm sideeffect "", "~{a15}" ()
+  ret void
+}
+
+attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" }

diff  --git a/llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s b/llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s
new file mode 100644
index 0000000000000..ea065ea9ef5a9
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/sym_kernel_scope_agpr.s
@@ -0,0 +1,62 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck -check-prefixes=GFX90A %s
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck -check-prefixes=GFX908 %s
+// Based on sym_kernel_scope.s
+
+.byte .kernel.agpr_count
+// CHECK: .byte 0
+.byte .kernel.vgpr_count
+// CHECK: .byte 0
+
+    v_accvgpr_write_b32 a0, v6
+    v_accvgpr_read_b32 v3, a3
+    s_endpgm
+.byte .kernel.agpr_count
+// GFX90A: .byte 4
+// GFX908: .byte 4
+.byte .kernel.vgpr_count
+// GFX90A: .byte 12
+// GFX908: .byte 7
+
+.amdgpu_hsa_kernel K1
+K1:
+.byte .kernel.agpr_count
+// CHECK: .byte 0
+.byte .kernel.vgpr_count
+// CHECK: .byte 0
+    v_accvgpr_write_b32 a44, v6
+    s_endpgm
+.byte .kernel.agpr_count
+// GFX90A: .byte 45
+// GFX908: .byte 45
+.byte .kernel.vgpr_count
+// GFX90A: .byte 53
+// GFX908: .byte 45
+
+.amdgpu_hsa_kernel K2
+.byte .kernel.agpr_count
+// CHECK: .byte 0
+.byte .kernel.vgpr_count
+// CHECK: .byte 0
+K2:
+    v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
+    s_endpgm
+.byte .kernel.agpr_count
+// GFX90A: .byte 4
+// GFX908: .byte 4
+.byte .kernel.vgpr_count
+// GFX90A: .byte 8
+// GFX908: .byte 4
+
+.text
+.amdgpu_hsa_kernel K3
+K3:
+    v_accvgpr_read_b32 v[0], a0
+    v_mfma_f32_16x16x1f32 a[0:15], v1, v0, a[0:15] cbsz:1 abid:2 blgp:3
+    s_endpgm
+
+.byte .kernel.agpr_count
+// GFX90A: .byte 16
+// GFX908: .byte 16
+.byte .kernel.vgpr_count
+// GFX90A: .byte 20
+// GFX908: .byte 16