[llvm] fc83f1d - [AMDGPU] Add backend support for new PAL ELF Metadata 3.0

David Stuttard via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 14 01:59:54 PDT 2023


Author: David Stuttard
Date: 2023-04-14T09:57:13+01:00
New Revision: fc83f1de5d5f6d49de6d62fef6379fb1aab497d3

URL: https://github.com/llvm/llvm-project/commit/fc83f1de5d5f6d49de6d62fef6379fb1aab497d3
DIFF: https://github.com/llvm/llvm-project/commit/fc83f1de5d5f6d49de6d62fef6379fb1aab497d3.diff

LOG: [AMDGPU] Add backend support for new PAL ELF Metadata 3.0

PAL Metadata 3.0 introduces an explicit structure in metadata for the
programmable registers written out by the compiler backend.
Rather than using opaque registers which can change between different
architectures and requires encoding the bitfield information in the backend,
which may change between versions.

This is the initial minimal implementation that enables the use of PAL Metadata
3.0.

The change itself should be NFC for non-PAL, although the way RSRC2 register is
handled has been changed slightly.

The test is fairly minimal, but checks that the metadata format looks as
expected and verifies a couple of special cases such as tgid_[xyz]_en handling
and PsInputAddr/Ena which also change to explicit fields.

Differential Revision: https://reviews.llvm.org/D147143

Added: 
    llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
    llvm/lib/Target/AMDGPU/SIProgramInfo.h
    llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 82c57dfcef0d8..1d29be9fcd00f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -441,7 +441,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
 
   assert(isUInt<32>(PI.ScratchSize));
   assert(isUInt<32>(PI.getComputePGMRSrc1()));
-  assert(isUInt<32>(PI.ComputePGMRSrc2));
+  assert(isUInt<32>(PI.getComputePGMRSrc2()));
 
   KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
   KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
@@ -450,7 +450,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
   KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
 
   KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
-  KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
+  KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2();
   KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
 
   assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
@@ -579,28 +579,27 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
     OutStreamer->emitRawComment(
       " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
 
-    OutStreamer->emitRawComment(
-      " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
-      Twine(G_00B84C_SCRATCH_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
-    OutStreamer->emitRawComment(
-      " COMPUTE_PGM_RSRC2:USER_SGPR: " +
-      Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
-    OutStreamer->emitRawComment(
-      " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
-      Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
-    OutStreamer->emitRawComment(
-      " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
-      Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
-    OutStreamer->emitRawComment(
-      " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
-      Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
-    OutStreamer->emitRawComment(
-      " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
-      Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
-    OutStreamer->emitRawComment(
-      " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
-      Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
-      false);
+    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
+                                    Twine(CurrentProgramInfo.ScratchEnable),
+                                false);
+    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
+                                    Twine(CurrentProgramInfo.UserSGPR),
+                                false);
+    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
+                                    Twine(CurrentProgramInfo.TrapHandlerEnable),
+                                false);
+    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+                                    Twine(CurrentProgramInfo.TGIdXEnable),
+                                false);
+    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+                                    Twine(CurrentProgramInfo.TGIdYEnable),
+                                false);
+    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+                                    Twine(CurrentProgramInfo.TGIdZEnable),
+                                false);
+    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+                                    Twine(CurrentProgramInfo.TIdIGCompCount),
+                                false);
 
     assert(STM.hasGFX90AInsts() ||
            CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
@@ -922,22 +921,21 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // anything to disable it if we know the stack isn't used here. We may still
   // have emitted code reading it to initialize scratch, but if that's unused
   // reading garbage should be OK.
-  const bool EnablePrivateSegment =
+  ProgInfo.ScratchEnable =
       ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
-  ProgInfo.ComputePGMRSrc2 =
-      S_00B84C_SCRATCH_EN(EnablePrivateSegment) |
-      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
-      // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
-      S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
-      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
-      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
-      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
-      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
-      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
-      S_00B84C_EXCP_EN_MSB(0) |
-      // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
-      S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
-      S_00B84C_EXCP_EN(0);
+  ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
+  // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
+  ProgInfo.TrapHandlerEnable =
+      STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
+  ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
+  ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
+  ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
+  ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
+  ProgInfo.TIdIGCompCount = TIDIGCompCnt;
+  ProgInfo.EXCPEnMSB = 0;
+  // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
+  ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
+  ProgInfo.EXCPEnable = 0;
 
   if (STM.hasGFX90AInsts()) {
     AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
@@ -978,7 +976,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
     OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());
 
     OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
-    OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);
+    OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
 
     OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
     OutStreamer->emitInt32(
@@ -1038,25 +1036,87 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
   }
 
   MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
-  MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
-  if (AMDGPU::isCompute(CC)) {
-    MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2);
+  if (MD->getPALMajorVersion() < 3) {
+    MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
+    if (AMDGPU::isCompute(CC)) {
+      MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
+    } else {
+      if (CurrentProgramInfo.ScratchBlocks > 0)
+        MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
+    }
   } else {
-    if (CurrentProgramInfo.ScratchBlocks > 0)
-      MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
+    // Priority?
+    MD->setHwStage(CC, ".float_mode", CurrentProgramInfo.FloatMode);
+    // Priv?
+    // DX10Clamp?
+    MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
+    MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
+    MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
+    MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
+
+    if (AMDGPU::isCompute(CC)) {
+      MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
+      MD->setHwStage(CC, ".trap_present",
+                     (bool)CurrentProgramInfo.TrapHandlerEnable);
+
+      // Compute registers
+      // If the front-end has set tgid_x/y/z_en - assert that the
+      // CurrentProgramInfo is consistent (usually set with function attributes
+      // amdgpu-no-workgroup-id-x etc.).
+      assert(MD->checkComputeRegisters(".tgid_x_en",
+                                       (bool)CurrentProgramInfo.TGIdXEnable));
+      assert(MD->checkComputeRegisters(".tgid_y_en",
+                                       (bool)CurrentProgramInfo.TGIdYEnable));
+      assert(MD->checkComputeRegisters(".tgid_z_en",
+                                       (bool)CurrentProgramInfo.TGIdZEnable));
+
+      // EXCPEnMSB?
+      const unsigned LdsDwGranularity = 128;
+      MD->setHwStage(CC, ".lds_size",
+                     (unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity *
+                                sizeof(uint32_t)));
+      MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
+    } else {
+      MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
+    }
   }
+
   // ScratchSize is in bytes, 16 aligned.
   MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
     unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                 : CurrentProgramInfo.LDSBlocks;
-    MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
-    MD->setSpiPsInputEna(MFI->getPSInputEnable());
-    MD->setSpiPsInputAddr(MFI->getPSInputAddr());
+    if (MD->getPALMajorVersion() < 3) {
+      MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
+      MD->setSpiPsInputEna(MFI->getPSInputEnable());
+      MD->setSpiPsInputAddr(MFI->getPSInputAddr());
+    } else {
+      // Graphics registers
+      MD->setGraphicsRegisters(".ps_extra_lds_size", ExtraLDSSize);
+      // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
+      static StringLiteral const PsInputFields[] = {
+          ".persp_sample_ena",    ".persp_center_ena",
+          ".persp_centroid_ena",  ".persp_pull_model_ena",
+          ".linear_sample_ena",   ".linear_center_ena",
+          ".linear_centroid_ena", ".line_stipple_tex_ena",
+          ".pos_x_float_ena",     ".pos_y_float_ena",
+          ".pos_z_float_ena",     ".pos_w_float_ena",
+          ".front_face_ena",      ".ancillary_ena",
+          ".sample_coverage_ena", ".pos_fixed_pt_ena"};
+      unsigned PSInputEna = MFI->getPSInputEnable();
+      unsigned PSInputAddr = MFI->getPSInputAddr();
+      for (auto [Idx, Field] : enumerate(PsInputFields)) {
+        MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
+                                 (bool)((PSInputEna >> Idx) & 1));
+        MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
+                                 (bool)((PSInputAddr >> Idx) & 1));
+      }
+    }
   }
 
-  if (STM.isWave32())
+  // For version 3 and above the wave front size is already set in the metadata
+  if (MD->getPALMajorVersion() < 3 && STM.isWave32())
     MD->setWave32(MF.getFunction().getCallingConv());
 }
 
@@ -1068,7 +1128,7 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
   // Set compute registers
   MD->setRsrc1(CallingConv::AMDGPU_CS,
                CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
-  MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2);
+  MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());
 
   // Set optional info
   MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize);
@@ -1104,7 +1164,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
 
   Out.compute_pgm_resource_registers =
       CurrentProgramInfo.getComputePGMRSrc1() |
-      (CurrentProgramInfo.ComputePGMRSrc2 << 32);
+      (CurrentProgramInfo.getComputePGMRSrc2() << 32);
   Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
 
   if (CurrentProgramInfo.DynamicCallStack)

diff  --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 877c8b81b2c07..b6839c8308d8e 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -54,3 +54,23 @@ uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC) const {
   }
   return Reg;
 }
+
+uint64_t SIProgramInfo::getComputePGMRSrc2() const {
+  uint64_t Reg =
+      S_00B84C_SCRATCH_EN(ScratchEnable) | S_00B84C_USER_SGPR(UserSGPR) |
+      S_00B84C_TRAP_HANDLER(TrapHandlerEnable) |
+      S_00B84C_TGID_X_EN(TGIdXEnable) | S_00B84C_TGID_Y_EN(TGIdYEnable) |
+      S_00B84C_TGID_Z_EN(TGIdZEnable) | S_00B84C_TG_SIZE_EN(TGSizeEnable) |
+      S_00B84C_TIDIG_COMP_CNT(TIdIGCompCount) |
+      S_00B84C_EXCP_EN_MSB(EXCPEnMSB) | S_00B84C_LDS_SIZE(LdsSize) |
+      S_00B84C_EXCP_EN(EXCPEnable);
+
+  return Reg;
+}
+
+uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const {
+  if (AMDGPU::isCompute(CC))
+    return getComputePGMRSrc2();
+
+  return 0;
+}

diff  --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 553fb4cf496c1..aab127e494630 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -36,11 +36,23 @@ struct SIProgramInfo {
     uint32_t MemOrdered = 0; // GFX10+
     uint64_t ScratchSize = 0;
 
-    // Fields set in PGM_RSRC2 pm4 packet.
+    // State used to calculate fields set in PGM_RSRC2 pm4 packet.
     uint32_t LDSBlocks = 0;
     uint32_t ScratchBlocks = 0;
 
-    uint64_t ComputePGMRSrc2 = 0;
+    // Fields set in PGM_RSRC2 pm4 packet
+    uint32_t ScratchEnable = 0;
+    uint32_t UserSGPR = 0;
+    uint32_t TrapHandlerEnable = 0;
+    uint32_t TGIdXEnable = 0;
+    uint32_t TGIdYEnable = 0;
+    uint32_t TGIdZEnable = 0;
+    uint32_t TGSizeEnable = 0;
+    uint32_t TIdIGCompCount = 0;
+    uint32_t EXCPEnMSB = 0;
+    uint32_t LdsSize = 0;
+    uint32_t EXCPEnable = 0;
+
     uint64_t ComputePGMRSrc3GFX90A = 0;
 
     uint32_t NumVGPR = 0;
@@ -75,6 +87,10 @@ struct SIProgramInfo {
     /// Compute the value of the ComputePGMRsrc1 register.
     uint64_t getComputePGMRSrc1() const;
     uint64_t getPGMRSrc1(CallingConv::ID CC) const;
+
+    /// Compute the value of the ComputePGMRsrc2 register.
+    uint64_t getComputePGMRSrc2() const;
+    uint64_t getPGMRSrc2(CallingConv::ID CC) const;
 };
 
 } // namespace llvm

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index 4ad93f7b0b682..a92d574b18489 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -811,6 +811,38 @@ msgpack::MapDocNode AMDGPUPALMetadata::getShaderFunction(StringRef Name) {
   return Functions[Name].getMap(/*Convert=*/true);
 }
 
+msgpack::DocNode &AMDGPUPALMetadata::refComputeRegisters() {
+  auto &N =
+      MsgPackDoc.getRoot()
+          .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
+          .getArray(/*Convert=*/true)[0]
+          .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".compute_registers")];
+  N.getMap(/*Convert=*/true);
+  return N;
+}
+
+msgpack::MapDocNode AMDGPUPALMetadata::getComputeRegisters() {
+  if (ComputeRegisters.isEmpty())
+    ComputeRegisters = refComputeRegisters();
+  return ComputeRegisters.getMap();
+}
+
+msgpack::DocNode &AMDGPUPALMetadata::refGraphicsRegisters() {
+  auto &N =
+      MsgPackDoc.getRoot()
+          .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
+          .getArray(/*Convert=*/true)[0]
+          .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".graphics_registers")];
+  N.getMap(/*Convert=*/true);
+  return N;
+}
+
+msgpack::MapDocNode AMDGPUPALMetadata::getGraphicsRegisters() {
+  if (GraphicsRegisters.isEmpty())
+    GraphicsRegisters = refGraphicsRegisters();
+  return GraphicsRegisters.getMap();
+}
+
 // Return the PAL metadata hardware shader stage name.
 static const char *getStageName(CallingConv::ID CC) {
   switch (CC) {
@@ -833,15 +865,21 @@ static const char *getStageName(CallingConv::ID CC) {
   }
 }
 
+msgpack::DocNode &AMDGPUPALMetadata::refHwStage() {
+  auto &N =
+      MsgPackDoc.getRoot()
+          .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
+          .getArray(/*Convert=*/true)[0]
+          .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".hardware_stages")];
+  N.getMap(/*Convert=*/true);
+  return N;
+}
+
 // Get (create if necessary) the .hardware_stages entry for the given calling
 // convention.
 msgpack::MapDocNode AMDGPUPALMetadata::getHwStage(unsigned CC) {
   if (HwStages.isEmpty())
-    HwStages = MsgPackDoc.getRoot()
-                   .getMap(/*Convert=*/true)["amdpal.pipelines"]
-                   .getArray(/*Convert=*/true)[0]
-                   .getMap(/*Convert=*/true)[".hardware_stages"]
-                   .getMap(/*Convert=*/true);
+    HwStages = refHwStage();
   return HwStages.getMap()[getStageName(CC)].getMap(/*Convert=*/true);
 }
 
@@ -874,3 +912,78 @@ void AMDGPUPALMetadata::reset() {
   Registers = MsgPackDoc.getEmptyNode();
   HwStages = MsgPackDoc.getEmptyNode();
 }
+
+unsigned AMDGPUPALMetadata::getPALVersion(unsigned idx) {
+  assert(idx < 2 &&
+         "illegal index to PAL version - should be 0 (major) or 1 (minor)");
+  if (!VersionChecked) {
+    if (Version.isEmpty()) {
+      auto &M = MsgPackDoc.getRoot().getMap(/*Convert=*/true);
+      auto I = M.find(MsgPackDoc.getNode("amdpal.version"));
+      if (I != M.end())
+        Version = I->second;
+    }
+    VersionChecked = true;
+  }
+  if (Version.isEmpty())
+    // Default to 2.6 if there's no version info
+    return idx ? 6 : 2;
+  return Version.getArray()[idx].getUInt();
+}
+
+unsigned AMDGPUPALMetadata::getPALMajorVersion() { return getPALVersion(0); }
+
+unsigned AMDGPUPALMetadata::getPALMinorVersion() { return getPALVersion(1); }
+
+// Set the field in a given .hardware_stages entry
+void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, unsigned Val) {
+  getHwStage(CC)[field] = Val;
+}
+
+void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, bool Val) {
+  getHwStage(CC)[field] = Val;
+}
+
+void AMDGPUPALMetadata::setComputeRegisters(StringRef field, unsigned Val) {
+  getComputeRegisters()[field] = Val;
+}
+
+void AMDGPUPALMetadata::setComputeRegisters(StringRef field, bool Val) {
+  getComputeRegisters()[field] = Val;
+}
+
+msgpack::DocNode *AMDGPUPALMetadata::refComputeRegister(StringRef field) {
+  auto M = getComputeRegisters();
+  auto I = M.find(field);
+  return I == M.end() ? nullptr : &I->second;
+}
+
+bool AMDGPUPALMetadata::checkComputeRegisters(StringRef field, unsigned Val) {
+  if (auto N = refComputeRegister(field))
+    return N->getUInt() == Val;
+  return false;
+}
+
+bool AMDGPUPALMetadata::checkComputeRegisters(StringRef field, bool Val) {
+  if (auto N = refComputeRegister(field))
+    return N->getBool() == Val;
+  return false;
+}
+
+void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field, unsigned Val) {
+  getGraphicsRegisters()[field] = Val;
+}
+
+void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field, bool Val) {
+  getGraphicsRegisters()[field] = Val;
+}
+
+void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field1, StringRef field2,
+                                             unsigned Val) {
+  getGraphicsRegisters()[field1].getMap(true)[field2] = Val;
+}
+
+void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field1, StringRef field2,
+                                             bool Val) {
+  getGraphicsRegisters()[field1].getMap(true)[field2] = Val;
+}

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index a45a799e38a9b..e477904cb81f9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -27,6 +27,11 @@ class AMDGPUPALMetadata {
   msgpack::DocNode Registers;
   msgpack::DocNode HwStages;
   msgpack::DocNode ShaderFunctions;
+  bool VersionChecked = false;
+  msgpack::DocNode Version;
+  // From PAL version >= 3.0
+  msgpack::DocNode ComputeRegisters;
+  msgpack::DocNode GraphicsRegisters;
 
 public:
   // Read the amdgpu.pal.metadata supplied by the frontend, ready for
@@ -129,6 +134,26 @@ class AMDGPUPALMetadata {
   // Set legacy PAL metadata format.
   void setLegacy();
 
+  unsigned getPALMajorVersion();
+  unsigned getPALMinorVersion();
+
+  void setHwStage(unsigned CC, StringRef field, unsigned Val);
+  void setHwStage(unsigned CC, StringRef field, bool Val);
+
+  void setComputeRegisters(StringRef field, unsigned Val);
+  void setComputeRegisters(StringRef field, bool Val);
+
+  // If the field does not exist will return nullptr rather than creating a new
+  // entry (which is the behaviour of the other functions).
+  msgpack::DocNode *refComputeRegister(StringRef field);
+  bool checkComputeRegisters(StringRef field, unsigned Val);
+  bool checkComputeRegisters(StringRef field, bool Val);
+
+  void setGraphicsRegisters(StringRef field, unsigned Val);
+  void setGraphicsRegisters(StringRef field, bool Val);
+  void setGraphicsRegisters(StringRef field1, StringRef field2, unsigned Val);
+  void setGraphicsRegisters(StringRef field1, StringRef field2, bool Val);
+
   // Erase all PAL metadata.
   void reset();
 
@@ -151,10 +176,29 @@ class AMDGPUPALMetadata {
   // Get (create if necessary) a function in the shader functions map.
   msgpack::MapDocNode getShaderFunction(StringRef Name);
 
+  // Reference (create if necessary) the node for the compute_registers map.
+  msgpack::DocNode &refComputeRegisters();
+
+  // Get (create if necessary) the .compute_registers entry.
+  msgpack::MapDocNode getComputeRegisters();
+
+  // Reference (create if necessary) the node for the graphics registers map.
+  msgpack::DocNode &refGraphicsRegisters();
+
+  // Get (create if necessary) the .graphics_registers entry.
+  msgpack::MapDocNode getGraphicsRegisters();
+
+  // Reference (create if necessary) the node for the hardware_stages map.
+  msgpack::DocNode &refHwStage();
+
   // Get (create if necessary) the .hardware_stages entry for the given calling
   // convention.
   msgpack::MapDocNode getHwStage(unsigned CC);
 
+  // Get the PAL version major (idx 0) or minor (idx 1). This is an internal
+  // helper for the public wrapper functions that request Major or Minor
+  unsigned getPALVersion(unsigned idx);
+
   bool setFromLegacyBlob(StringRef Blob);
   bool setFromMsgPackBlob(StringRef Blob);
   void toLegacyBlob(std::string &Blob);

diff  --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
new file mode 100644
index 0000000000000..2b9f639f735a0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
@@ -0,0 +1,165 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s
+
+; CHECK-LABEL: {{^}}_amdgpu_cs_main:
+; CHECK: ; NumSgprs: 4
+; CHECK: ; NumVgprs: 2
+; CHECK:           .amdgpu_pal_metadata
+; CHECK-NEXT: ---
+; CHECK-NEXT: amdpal.pipelines:
+; CHECK-NEXT:   - .api:            Vulkan
+; CHECK-NEXT:     .compute_registers:
+; CHECK-NEXT:       .tg_size_en:     true
+; CHECK-NEXT:       .tgid_x_en:      false
+; CHECK-NEXT:       .tgid_y_en:      false
+; CHECK-NEXT:       .tgid_z_en:      false
+; CHECK-NEXT:       .tidig_comp_cnt: 0x1
+; CHECK-NEXT:     .graphics_registers:
+; CHECK-NEXT:      .ps_extra_lds_size: 0
+; CHECK-NEXT:      .spi_ps_input_addr:
+; CHECK-NEXT:        .ancillary_ena:  false
+; CHECK-NEXT:        .front_face_ena: true
+; CHECK-NEXT:        .line_stipple_tex_ena: false
+; CHECK-NEXT:        .linear_center_ena: true
+; CHECK-NEXT:        .linear_centroid_ena: true
+; CHECK-NEXT:        .linear_sample_ena: true
+; CHECK-NEXT:        .persp_center_ena: true
+; CHECK-NEXT:        .persp_centroid_ena: true
+; CHECK-NEXT:        .persp_pull_model_ena: false
+; CHECK-NEXT:        .persp_sample_ena: true
+; CHECK-NEXT:        .pos_fixed_pt_ena: true
+; CHECK-NEXT:        .pos_w_float_ena: false
+; CHECK-NEXT:        .pos_x_float_ena: false
+; CHECK-NEXT:        .pos_y_float_ena: false
+; CHECK-NEXT:        .pos_z_float_ena: false
+; CHECK-NEXT:        .sample_coverage_ena: false
+; CHECK-NEXT:      .spi_ps_input_ena:
+; CHECK-NEXT:        .ancillary_ena:  false
+; CHECK-NEXT:        .front_face_ena: false
+; CHECK-NEXT:        .line_stipple_tex_ena: false
+; CHECK-NEXT:        .linear_center_ena: false
+; CHECK-NEXT:        .linear_centroid_ena: false
+; CHECK-NEXT:        .linear_sample_ena: false
+; CHECK-NEXT:        .persp_center_ena: false
+; CHECK-NEXT:        .persp_centroid_ena: false
+; CHECK-NEXT:        .persp_pull_model_ena: false
+; CHECK-NEXT:        .persp_sample_ena: true
+; CHECK-NEXT:        .pos_fixed_pt_ena: false
+; CHECK-NEXT:        .pos_w_float_ena: false
+; CHECK-NEXT:        .pos_x_float_ena: false
+; CHECK-NEXT:        .pos_y_float_ena: false
+; CHECK-NEXT:        .pos_z_float_ena: false
+; CHECK-NEXT:        .sample_coverage_ena: false
+; CHECK-NEXT:    .hardware_stages:
+; CHECK-NEXT:      .cs:
+; CHECK-NEXT:        .checksum_value: 0x9444d7d0
+; CHECK-NEXT:        .debug_mode:     false
+; CHECK-NEXT:        .entry_point:    _amdgpu_cs_main
+; CHECK-NEXT:        .excp_en:        0
+; CHECK-NEXT:        .float_mode:     0xc0
+; CHECK-NEXT:        .ieee_mode:      false
+; CHECK-NEXT:        .image_op:       false
+; CHECK-NEXT:        .lds_size:       0
+; CHECK-NEXT:        .mem_ordered:    true
+; CHECK-NEXT:        .scratch_en:     false
+; CHECK-NEXT:        .scratch_memory_size: 0
+; CHECK-NEXT:        .sgpr_count:     0x4
+; CHECK-NEXT:        .sgpr_limit:     0x6a
+; CHECK-NEXT:        .threadgroup_dimensions:
+; CHECK-NEXT:          - 0x1
+; CHECK-NEXT:          - 0x400
+; CHECK-NEXT:          - 0x1
+; CHECK-NEXT:        .trap_present:   false
+; CHECK-NEXT:        .user_data_reg_map:
+; CHECK-NEXT:          - 0x10000000
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:          - 0xffffffff
+; CHECK-NEXT:        .user_sgprs:     0x3
+; CHECK-NEXT:        .vgpr_count:     0x2
+; CHECK-NEXT:        .vgpr_limit:     0x100
+; CHECK-NEXT:        .wavefront_size: 0x40
+; CHECK-NEXT:        .wgp_mode:       false
+; CHECK-NEXT:      .ps:
+; CHECK-NEXT:        .debug_mode:     false
+; CHECK-NEXT:        .entry_point:    ps_shader
+; CHECK-NEXT:        .float_mode:     0xf0
+; CHECK-NEXT:        .ieee_mode:      false
+; CHECK-NEXT:        .mem_ordered:    true
+; CHECK-NEXT:        .scratch_en:     false
+; CHECK-NEXT:        .scratch_memory_size: 0
+; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .vgpr_count:     0x1
+; CHECK-NEXT:        .wgp_mode:       true
+; CHECK:    .registers:      {}
+; CHECK:amdpal.version:
+; CHECK-NEXT:  - 0x3
+; CHECK-NEXT:  - 0
+; CHECK-NEXT:...
+; CHECK-NEXT:        .end_amdgpu_pal_metadata
+
+define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 {
+.entry:
+  %i = call i64 @llvm.amdgcn.s.getpc()
+  %i1 = and i64 %i, -4294967296
+  %i2 = zext i32 %arg1 to i64
+  %i3 = or i64 %i1, %i2
+  %i4 = inttoptr i64 %i3 to ptr addrspace(4)
+  %i5 = and i32 %arg2, 1023
+  %i6 = lshr i32 %arg2, 10
+  %i7 = and i32 %i6, 1023
+  %i8 = add nuw nsw i32 %i7, %i5
+  %i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16
+  %.idx = shl nuw nsw i32 %i8, 2
+  call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0)
+  ret void
+}
+
+define dllexport amdgpu_ps void @ps_shader() #1 {
+  ret void
+}
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+; Function Attrs: nounwind willreturn memory(none)
+declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i64 @llvm.amdgcn.s.getpc() #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3
+
+attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,+cumode" }
+
+attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" }
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
+!1 = !{i32 7}


        


More information about the llvm-commits mailing list