[llvm] [AMDGPU][Draft] OOB mode - module flag (PR #160922)

via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 26 09:41:27 PDT 2025


github-actions[bot] wrote:

:warning: The C/C++ code formatter, clang-format, found issues in your code. :warning:

<details>
<summary>
You can test this locally with the following command:
</summary>

``````````bash
git-clang-format --diff origin/main HEAD --extensions h,cpp -- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp llvm/lib/Target/AMDGPU/GCNSubtarget.h
``````````

:warning:
The reproduction instructions above might return results for more than one PR
in a stack if you are using a stacked PR workflow. You can limit the results by
changing `origin/main` to the base branch/commit you want to compare against.
:warning:
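
For example, to limit the diff to this PR alone in a stacked workflow, compare against the parent branch of the stack instead; the branch name below is a placeholder, not one from this PR:

``````````bash
# Placeholder base: substitute the actual parent branch or base commit of this PR.
git-clang-format --diff origin/parent-branch HEAD --extensions h,cpp -- \
  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp llvm/lib/Target/AMDGPU/GCNSubtarget.h
``````````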

</details>
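Note: most of the reindentation in the diff below is a knock-on effect of one bug in the PR rather than general style drift. In `hasRelaxedBufferOOBMode()` the closing brace is trapped inside the trailing `// TODO: Use named const/enum.}` comment, so the function body never closes and clang-format indents every subsequent member of `GCNSubtarget` one level deeper. A minimal sketch of the repaired accessors, assuming a named enum along the lines the TODO asks for (the class wrapper, enum name, and enumerators are illustrative, not code from the PR):

``````````cpp
// Sketch only: GCNSubtargetSketch, BufferOOBMode, and its enumerators are
// invented here to illustrate the "TODO: Use named const/enum".
class GCNSubtargetSketch {
  unsigned OOBMode = 0;

public:
  enum BufferOOBMode : unsigned {
    StrictOOB = 0,                   // all OOB checks stay strict
    RelaxedUntypedBuffers = 1u << 0, // OOBMode{0} - untyped buffers (buffer_load)
    RelaxedTypedBuffers = 1u << 1,   // OOBMode{1} - typed buffers (tbuffer_load)
  };

  // The '}' now sits outside the comment, so the body actually closes and
  // the members that follow keep their original indentation.
  bool hasRelaxedBufferOOBMode() const { return OOBMode == RelaxedUntypedBuffers; }
  void setOOBMode(unsigned Val) { OOBMode = Val; }
};
``````````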

<details>
<summary>
View the diff from clang-format here.
</summary>

``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index cf50a0c6d..2b5616e70 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -296,9 +296,9 @@ protected:
   // Setting a bit enables a relaxed mode that disables strict OOB guarantees;
   // an out-of-bounds access may cause a neighboring in-bounds access to be
   // treated as OOB.
-  // If bit is set, enable relaxed mode. 0 in a bit keeps the corresponding check strict.
-  // OOBMode{0} - untyped buffers (buffer_load)
-  // OOBMode{1} - typed buffers (tbuffer_load)
+  // If bit is set, enable relaxed mode. 0 in a bit keeps the corresponding
+  // check strict. OOBMode{0} - untyped buffers (buffer_load) OOBMode{1} - typed
+  // buffers (tbuffer_load)
   unsigned OOBMode = 0;
 
 private:
@@ -656,1220 +656,1048 @@ public:
     return UnalignedAccessMode;
   }
 
-  bool hasRelaxedBufferOOBMode() const { return OOBMode == 1; // TODO: Use named const/enum.}
-  void setOOBMode(unsigned val) { OOBMode = val; }
+  bool hasRelaxedBufferOOBMode() const {
+    return OOBMode == 1; // TODO: Use named const/enum.}
+    void setOOBMode(unsigned val) { OOBMode = val; }
 
-  bool hasApertureRegs() const {
-    return HasApertureRegs;
-  }
-
-  bool isTrapHandlerEnabled() const {
-    return TrapHandler;
-  }
-
-  bool isXNACKEnabled() const {
-    return TargetID.isXnackOnOrAny();
-  }
-
-  bool isTgSplitEnabled() const {
-    return EnableTgSplit;
-  }
-
-  bool isCuModeEnabled() const {
-    return EnableCuMode;
-  }
+    bool hasApertureRegs() const { return HasApertureRegs; }
 
-  bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
-
-  bool hasFlatAddressSpace() const {
-    return FlatAddressSpace;
-  }
-
-  bool hasFlatScrRegister() const {
-    return hasFlatAddressSpace();
-  }
-
-  bool hasFlatInstOffsets() const {
-    return FlatInstOffsets;
-  }
-
-  bool hasFlatGlobalInsts() const {
-    return FlatGlobalInsts;
-  }
+    bool isTrapHandlerEnabled() const { return TrapHandler; }
 
-  bool hasFlatScratchInsts() const {
-    return FlatScratchInsts;
-  }
+    bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
 
-  // Check if target supports ST addressing mode with FLAT scratch instructions.
-  // The ST addressing mode means no registers are used, either VGPR or SGPR,
-  // but only immediate offset is swizzled and added to the FLAT scratch base.
-  bool hasFlatScratchSTMode() const {
-    return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
-  }
+    bool isTgSplitEnabled() const { return EnableTgSplit; }
 
-  bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
+    bool isCuModeEnabled() const { return EnableCuMode; }
 
-  bool hasScalarFlatScratchInsts() const {
-    return ScalarFlatScratchInsts;
-  }
+    bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
 
-  bool enableFlatScratch() const {
-    return flatScratchIsArchitected() ||
-           (EnableFlatScratch && hasFlatScratchInsts());
-  }
+    bool hasFlatAddressSpace() const { return FlatAddressSpace; }
 
-  bool hasGlobalAddTidInsts() const {
-    return GFX10_BEncoding;
-  }
+    bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }
 
-  bool hasAtomicCSub() const {
-    return GFX10_BEncoding;
-  }
+    bool hasFlatInstOffsets() const { return FlatInstOffsets; }
 
-  bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }
+    bool hasFlatGlobalInsts() const { return FlatGlobalInsts; }
 
-  bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }
+    bool hasFlatScratchInsts() const { return FlatScratchInsts; }
 
-  bool hasExportInsts() const {
-    return !hasGFX940Insts() && !hasGFX1250Insts();
-  }
+    // Check if target supports ST addressing mode with FLAT scratch
+    // instructions. The ST addressing mode means no registers are used, either
+    // VGPR or SGPR, but only immediate offset is swizzled and added to the FLAT
+    // scratch base.
+    bool hasFlatScratchSTMode() const {
+      return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
+    }
 
-  bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); }
+    bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
 
-  // DS_ADD_F64/DS_ADD_RTN_F64
-  bool hasLdsAtomicAddF64() const {
-    return hasGFX90AInsts() || hasGFX1250Insts();
-  }
+    bool hasScalarFlatScratchInsts() const { return ScalarFlatScratchInsts; }
 
-  bool hasMultiDwordFlatScratchAddressing() const {
-    return getGeneration() >= GFX9;
-  }
+    bool enableFlatScratch() const {
+      return flatScratchIsArchitected() ||
+             (EnableFlatScratch && hasFlatScratchInsts());
+    }
 
-  bool hasFlatSegmentOffsetBug() const {
-    return HasFlatSegmentOffsetBug;
-  }
+    bool hasGlobalAddTidInsts() const { return GFX10_BEncoding; }
 
-  bool hasFlatLgkmVMemCountInOrder() const {
-    return getGeneration() > GFX9;
-  }
+    bool hasAtomicCSub() const { return GFX10_BEncoding; }
 
-  bool hasD16LoadStore() const {
-    return getGeneration() >= GFX9;
-  }
+    bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }
 
-  bool d16PreservesUnusedBits() const {
-    return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
-  }
+    bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }
 
-  bool hasD16Images() const {
-    return getGeneration() >= VOLCANIC_ISLANDS;
-  }
+    bool hasExportInsts() const {
+      return !hasGFX940Insts() && !hasGFX1250Insts();
+    }
 
-  /// Return if most LDS instructions have an m0 use that requires m0 to be
-  /// initialized.
-  bool ldsRequiresM0Init() const {
-    return getGeneration() < GFX9;
-  }
+    bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); }
 
-  // True if the hardware rewinds and replays GWS operations if a wave is
-  // preempted.
-  //
-  // If this is false, a GWS operation requires testing if a nack set the
-  // MEM_VIOL bit, and repeating if so.
-  bool hasGWSAutoReplay() const {
-    return getGeneration() >= GFX9;
-  }
-
-  /// \returns if target has ds_gws_sema_release_all instruction.
-  bool hasGWSSemaReleaseAll() const {
-    return CIInsts;
-  }
-
-  /// \returns true if the target has integer add/sub instructions that do not
-  /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
-  /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
-  /// for saturation.
-  bool hasAddNoCarry() const {
-    return AddNoCarryInsts;
-  }
+    // DS_ADD_F64/DS_ADD_RTN_F64
+    bool hasLdsAtomicAddF64() const {
+      return hasGFX90AInsts() || hasGFX1250Insts();
+    }
 
-  bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
+    bool hasMultiDwordFlatScratchAddressing() const {
+      return getGeneration() >= GFX9;
+    }
 
-  bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
+    bool hasFlatSegmentOffsetBug() const { return HasFlatSegmentOffsetBug; }
 
-  bool hasUnpackedD16VMem() const {
-    return HasUnpackedD16VMem;
-  }
+    bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }
 
-  // Covers VS/PS/CS graphics shaders
-  bool isMesaGfxShader(const Function &F) const {
-    return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
-  }
+    bool hasD16LoadStore() const { return getGeneration() >= GFX9; }
 
-  bool hasMad64_32() const {
-    return getGeneration() >= SEA_ISLANDS;
-  }
+    bool d16PreservesUnusedBits() const {
+      return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
+    }
 
-  bool hasSDWAOmod() const {
-    return HasSDWAOmod;
-  }
+    bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }
 
-  bool hasSDWAScalar() const {
-    return HasSDWAScalar;
-  }
+    /// Return if most LDS instructions have an m0 use that requires m0 to be
+    /// initialized.
+    bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }
 
-  bool hasSDWASdst() const {
-    return HasSDWASdst;
-  }
+    // True if the hardware rewinds and replays GWS operations if a wave is
+    // preempted.
+    //
+    // If this is false, a GWS operation requires testing if a nack set the
+    // MEM_VIOL bit, and repeating if so.
+    bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }
 
-  bool hasSDWAMac() const {
-    return HasSDWAMac;
-  }
+    /// \returns if target has ds_gws_sema_release_all instruction.
+    bool hasGWSSemaReleaseAll() const { return CIInsts; }
 
-  bool hasSDWAOutModsVOPC() const {
-    return HasSDWAOutModsVOPC;
-  }
+    /// \returns true if the target has integer add/sub instructions that do not
+    /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
+    /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
+    /// for saturation.
+    bool hasAddNoCarry() const { return AddNoCarryInsts; }
 
-  bool hasDLInsts() const {
-    return HasDLInsts;
-  }
+    bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
 
-  bool hasFmacF64Inst() const { return HasFmacF64Inst; }
+    bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
 
-  bool hasDot1Insts() const {
-    return HasDot1Insts;
-  }
+    bool hasUnpackedD16VMem() const { return HasUnpackedD16VMem; }
 
-  bool hasDot2Insts() const {
-    return HasDot2Insts;
-  }
+    // Covers VS/PS/CS graphics shaders
+    bool isMesaGfxShader(const Function &F) const {
+      return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
+    }
 
-  bool hasDot3Insts() const {
-    return HasDot3Insts;
-  }
+    bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
 
-  bool hasDot4Insts() const {
-    return HasDot4Insts;
-  }
+    bool hasSDWAOmod() const { return HasSDWAOmod; }
 
-  bool hasDot5Insts() const {
-    return HasDot5Insts;
-  }
+    bool hasSDWAScalar() const { return HasSDWAScalar; }
 
-  bool hasDot6Insts() const {
-    return HasDot6Insts;
-  }
+    bool hasSDWASdst() const { return HasSDWASdst; }
 
-  bool hasDot7Insts() const {
-    return HasDot7Insts;
-  }
+    bool hasSDWAMac() const { return HasSDWAMac; }
 
-  bool hasDot8Insts() const {
-    return HasDot8Insts;
-  }
+    bool hasSDWAOutModsVOPC() const { return HasSDWAOutModsVOPC; }
 
-  bool hasDot9Insts() const {
-    return HasDot9Insts;
-  }
+    bool hasDLInsts() const { return HasDLInsts; }
 
-  bool hasDot10Insts() const {
-    return HasDot10Insts;
-  }
+    bool hasFmacF64Inst() const { return HasFmacF64Inst; }
 
-  bool hasDot11Insts() const {
-    return HasDot11Insts;
-  }
+    bool hasDot1Insts() const { return HasDot1Insts; }
 
-  bool hasDot12Insts() const {
-    return HasDot12Insts;
-  }
+    bool hasDot2Insts() const { return HasDot2Insts; }
 
-  bool hasDot13Insts() const {
-    return HasDot13Insts;
-  }
+    bool hasDot3Insts() const { return HasDot3Insts; }
 
-  bool hasMAIInsts() const {
-    return HasMAIInsts;
-  }
+    bool hasDot4Insts() const { return HasDot4Insts; }
 
-  bool hasFP8Insts() const {
-    return HasFP8Insts;
-  }
+    bool hasDot5Insts() const { return HasDot5Insts; }
 
-  bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
+    bool hasDot6Insts() const { return HasDot6Insts; }
 
-  bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; }
+    bool hasDot7Insts() const { return HasDot7Insts; }
 
-  bool hasPkFmacF16Inst() const {
-    return HasPkFmacF16Inst;
-  }
+    bool hasDot8Insts() const { return HasDot8Insts; }
 
-  bool hasAtomicFMinFMaxF32GlobalInsts() const {
-    return HasAtomicFMinFMaxF32GlobalInsts;
-  }
+    bool hasDot9Insts() const { return HasDot9Insts; }
 
-  bool hasAtomicFMinFMaxF64GlobalInsts() const {
-    return HasAtomicFMinFMaxF64GlobalInsts;
-  }
+    bool hasDot10Insts() const { return HasDot10Insts; }
 
-  bool hasAtomicFMinFMaxF32FlatInsts() const {
-    return HasAtomicFMinFMaxF32FlatInsts;
-  }
+    bool hasDot11Insts() const { return HasDot11Insts; }
 
-  bool hasAtomicFMinFMaxF64FlatInsts() const {
-    return HasAtomicFMinFMaxF64FlatInsts;
-  }
+    bool hasDot12Insts() const { return HasDot12Insts; }
 
-  bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
+    bool hasDot13Insts() const { return HasDot13Insts; }
 
-  bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
+    bool hasMAIInsts() const { return HasMAIInsts; }
 
-  bool hasAtomicFaddInsts() const {
-    return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
-  }
+    bool hasFP8Insts() const { return HasFP8Insts; }
 
-  bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
+    bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
 
-  bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
+    bool hasFP8E5M3Insts() const { return HasFP8E5M3Insts; }
 
-  bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
-    return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
-  }
+    bool hasPkFmacF16Inst() const { return HasPkFmacF16Inst; }
 
-  bool hasAtomicBufferGlobalPkAddF16Insts() const {
-    return HasAtomicBufferGlobalPkAddF16Insts;
-  }
+    bool hasAtomicFMinFMaxF32GlobalInsts() const {
+      return HasAtomicFMinFMaxF32GlobalInsts;
+    }
 
-  bool hasAtomicGlobalPkAddBF16Inst() const {
-    return HasAtomicGlobalPkAddBF16Inst;
-  }
+    bool hasAtomicFMinFMaxF64GlobalInsts() const {
+      return HasAtomicFMinFMaxF64GlobalInsts;
+    }
 
-  bool hasAtomicBufferPkAddBF16Inst() const {
-    return HasAtomicBufferPkAddBF16Inst;
-  }
+    bool hasAtomicFMinFMaxF32FlatInsts() const {
+      return HasAtomicFMinFMaxF32FlatInsts;
+    }
 
-  bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
+    bool hasAtomicFMinFMaxF64FlatInsts() const {
+      return HasAtomicFMinFMaxF64FlatInsts;
+    }
 
-  /// \return true if the target has flat, global, and buffer atomic fadd for
-  /// double.
-  bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
-    return HasFlatBufferGlobalAtomicFaddF64Inst;
-  }
+    bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
 
-  /// \return true if the target's flat, global, and buffer atomic fadd for
-  /// float supports denormal handling.
-  bool hasMemoryAtomicFaddF32DenormalSupport() const {
-    return HasMemoryAtomicFaddF32DenormalSupport;
-  }
+    bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
 
-  /// \return true if atomic operations targeting fine-grained memory work
-  /// correctly at device scope, in allocations in host or peer PCIe device
-  /// memory.
-  bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
-    return HasAgentScopeFineGrainedRemoteMemoryAtomics;
-  }
+    bool hasAtomicFaddInsts() const {
+      return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
+    }
 
-  /// \return true if HW emulates system scope atomics unsupported by the PCI-e
-  /// via CAS loop.
-  bool hasEmulatedSystemScopeAtomics() const {
-    return HasEmulatedSystemScopeAtomics;
-  }
+    bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
 
-  bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
+    bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
 
-  bool hasDefaultComponentBroadcast() const {
-    return HasDefaultComponentBroadcast;
-  }
+    bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
+      return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
+    }
 
-  bool hasNoSdstCMPX() const {
-    return HasNoSdstCMPX;
-  }
+    bool hasAtomicBufferGlobalPkAddF16Insts() const {
+      return HasAtomicBufferGlobalPkAddF16Insts;
+    }
 
-  bool hasVscnt() const {
-    return HasVscnt;
-  }
+    bool hasAtomicGlobalPkAddBF16Inst() const {
+      return HasAtomicGlobalPkAddBF16Inst;
+    }
 
-  bool hasGetWaveIdInst() const {
-    return HasGetWaveIdInst;
-  }
+    bool hasAtomicBufferPkAddBF16Inst() const {
+      return HasAtomicBufferPkAddBF16Inst;
+    }
 
-  bool hasSMemTimeInst() const {
-    return HasSMemTimeInst;
-  }
+    bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
-  bool hasShaderCyclesRegister() const {
-    return HasShaderCyclesRegister;
-  }
+    /// \return true if the target has flat, global, and buffer atomic fadd for
+    /// double.
+    bool hasFlatBufferGlobalAtomicFaddF64Inst() const {
+      return HasFlatBufferGlobalAtomicFaddF64Inst;
+    }
 
-  bool hasShaderCyclesHiLoRegisters() const {
-    return HasShaderCyclesHiLoRegisters;
-  }
+    /// \return true if the target's flat, global, and buffer atomic fadd for
+    /// float supports denormal handling.
+    bool hasMemoryAtomicFaddF32DenormalSupport() const {
+      return HasMemoryAtomicFaddF32DenormalSupport;
+    }
 
-  bool hasVOP3Literal() const {
-    return HasVOP3Literal;
-  }
+    /// \return true if atomic operations targeting fine-grained memory work
+    /// correctly at device scope, in allocations in host or peer PCIe device
+    /// memory.
+    bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const {
+      return HasAgentScopeFineGrainedRemoteMemoryAtomics;
+    }
 
-  bool hasNoDataDepHazard() const {
-    return HasNoDataDepHazard;
-  }
+    /// \return true if HW emulates system scope atomics unsupported by the
+    /// PCI-e via CAS loop.
+    bool hasEmulatedSystemScopeAtomics() const {
+      return HasEmulatedSystemScopeAtomics;
+    }
 
-  bool vmemWriteNeedsExpWaitcnt() const {
-    return getGeneration() < SEA_ISLANDS;
-  }
+    bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
 
-  bool hasInstPrefetch() const {
-    return getGeneration() == GFX10 || getGeneration() == GFX11;
-  }
+    bool hasDefaultComponentBroadcast() const {
+      return HasDefaultComponentBroadcast;
+    }
 
-  bool hasPrefetch() const { return GFX12Insts; }
+    bool hasNoSdstCMPX() const { return HasNoSdstCMPX; }
 
-  bool hasVmemPrefInsts() const { return HasVmemPrefInsts; }
+    bool hasVscnt() const { return HasVscnt; }
 
-  bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
+    bool hasGetWaveIdInst() const { return HasGetWaveIdInst; }
 
-  bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
+    bool hasSMemTimeInst() const { return HasSMemTimeInst; }
 
-  // Has s_cmpk_* instructions.
-  bool hasSCmpK() const { return getGeneration() < GFX12; }
+    bool hasShaderCyclesRegister() const { return HasShaderCyclesRegister; }
 
-  // Scratch is allocated in 256 dword per wave blocks for the entire
-  // wavefront. When viewed from the perspective of an arbitrary workitem, this
-  // is 4-byte aligned.
-  //
-  // Only 4-byte alignment is really needed to access anything. Transformations
-  // on the pointer value itself may rely on the alignment / known low bits of
-  // the pointer. Set this to something above the minimum to avoid needing
-  // dynamic realignment in common cases.
-  Align getStackAlignment() const { return Align(16); }
+    bool hasShaderCyclesHiLoRegisters() const {
+      return HasShaderCyclesHiLoRegisters;
+    }
 
-  bool enableMachineScheduler() const override {
-    return true;
-  }
+    bool hasVOP3Literal() const { return HasVOP3Literal; }
 
-  bool useAA() const override;
+    bool hasNoDataDepHazard() const { return HasNoDataDepHazard; }
 
-  bool enableSubRegLiveness() const override {
-    return true;
-  }
+    bool vmemWriteNeedsExpWaitcnt() const {
+      return getGeneration() < SEA_ISLANDS;
+    }
 
-  void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
-  bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
+    bool hasInstPrefetch() const {
+      return getGeneration() == GFX10 || getGeneration() == GFX11;
+    }
 
-  // static wrappers
-  static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
+    bool hasPrefetch() const { return GFX12Insts; }
 
-  // XXX - Why is this here if it isn't in the default pass set?
-  bool enableEarlyIfConversion() const override {
-    return true;
-  }
+    bool hasVmemPrefInsts() const { return HasVmemPrefInsts; }
 
-  void overrideSchedPolicy(MachineSchedPolicy &Policy,
-                           const SchedRegion &Region) const override;
+    bool hasSafeSmemPrefetch() const { return HasSafeSmemPrefetch; }
 
-  void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
-                                 const SchedRegion &Region) const override;
+    bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; }
 
-  void mirFileLoaded(MachineFunction &MF) const override;
+    // Has s_cmpk_* instructions.
+    bool hasSCmpK() const { return getGeneration() < GFX12; }
 
-  unsigned getMaxNumUserSGPRs() const {
-    return AMDGPU::getMaxNumUserSGPRs(*this);
-  }
+    // Scratch is allocated in 256 dword per wave blocks for the entire
+    // wavefront. When viewed from the perspective of an arbitrary workitem,
+    // this is 4-byte aligned.
+    //
+    // Only 4-byte alignment is really needed to access anything.
+    // Transformations on the pointer value itself may rely on the alignment /
+    // known low bits of the pointer. Set this to something above the minimum to
+    // avoid needing dynamic realignment in common cases.
+    Align getStackAlignment() const { return Align(16); }
 
-  bool hasSMemRealTime() const {
-    return HasSMemRealTime;
-  }
+    bool enableMachineScheduler() const override { return true; }
 
-  bool hasMovrel() const {
-    return HasMovrel;
-  }
+    bool useAA() const override;
 
-  bool hasVGPRIndexMode() const {
-    return HasVGPRIndexMode;
-  }
+    bool enableSubRegLiveness() const override { return true; }
 
-  bool useVGPRIndexMode() const;
+    void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
+    bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
 
-  bool hasScalarCompareEq64() const {
-    return getGeneration() >= VOLCANIC_ISLANDS;
-  }
+    // static wrappers
+    static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
 
-  bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
+    // XXX - Why is this here if it isn't in the default pass set?
+    bool enableEarlyIfConversion() const override { return true; }
 
-  bool hasScalarStores() const {
-    return HasScalarStores;
-  }
+    void overrideSchedPolicy(MachineSchedPolicy & Policy,
+                             const SchedRegion &Region) const override;
 
-  bool hasScalarAtomics() const {
-    return HasScalarAtomics;
-  }
+    void overridePostRASchedPolicy(MachineSchedPolicy & Policy,
+                                   const SchedRegion &Region) const override;
 
-  bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
-  bool hasLDSFPAtomicAddF64() const { return GFX90AInsts || GFX1250Insts; }
+    void mirFileLoaded(MachineFunction & MF) const override;
 
-  /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
-  bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
+    unsigned getMaxNumUserSGPRs() const {
+      return AMDGPU::getMaxNumUserSGPRs(*this);
+    }
 
-  /// \returns true if the subtarget has the v_permlane64_b32 instruction.
-  bool hasPermLane64() const { return getGeneration() >= GFX11; }
+    bool hasSMemRealTime() const { return HasSMemRealTime; }
 
-  bool hasDPP() const {
-    return HasDPP;
-  }
+    bool hasMovrel() const { return HasMovrel; }
 
-  bool hasDPPBroadcasts() const {
-    return HasDPP && getGeneration() < GFX10;
-  }
+    bool hasVGPRIndexMode() const { return HasVGPRIndexMode; }
 
-  bool hasDPPWavefrontShifts() const {
-    return HasDPP && getGeneration() < GFX10;
-  }
+    bool useVGPRIndexMode() const;
 
-  bool hasDPP8() const {
-    return HasDPP8;
-  }
+    bool hasScalarCompareEq64() const {
+      return getGeneration() >= VOLCANIC_ISLANDS;
+    }
 
-  bool hasDPALU_DPP() const {
-    return HasDPALU_DPP;
-  }
+    bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
 
-  bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
+    bool hasScalarStores() const { return HasScalarStores; }
 
-  bool hasPackedFP32Ops() const {
-    return HasPackedFP32Ops;
-  }
+    bool hasScalarAtomics() const { return HasScalarAtomics; }
 
-  // Has V_PK_MOV_B32 opcode
-  bool hasPkMovB32() const {
-    return GFX90AInsts;
-  }
+    bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
+    bool hasLDSFPAtomicAddF64() const { return GFX90AInsts || GFX1250Insts; }
 
-  bool hasFmaakFmamkF32Insts() const {
-    return getGeneration() >= GFX10 || hasGFX940Insts();
-  }
+    /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
+    bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
 
-  bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
+    /// \returns true if the subtarget has the v_permlane64_b32 instruction.
+    bool hasPermLane64() const { return getGeneration() >= GFX11; }
 
-  bool hasImageInsts() const {
-    return HasImageInsts;
-  }
+    bool hasDPP() const { return HasDPP; }
 
-  bool hasExtendedImageInsts() const {
-    return HasExtendedImageInsts;
-  }
+    bool hasDPPBroadcasts() const { return HasDPP && getGeneration() < GFX10; }
 
-  bool hasR128A16() const {
-    return HasR128A16;
-  }
+    bool hasDPPWavefrontShifts() const {
+      return HasDPP && getGeneration() < GFX10;
+    }
 
-  bool hasA16() const { return HasA16; }
+    bool hasDPP8() const { return HasDPP8; }
 
-  bool hasG16() const { return HasG16; }
+    bool hasDPALU_DPP() const { return HasDPALU_DPP; }
 
-  bool hasOffset3fBug() const {
-    return HasOffset3fBug;
-  }
+    bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
 
-  bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
+    bool hasPackedFP32Ops() const { return HasPackedFP32Ops; }
 
-  bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
+    // Has V_PK_MOV_B32 opcode
+    bool hasPkMovB32() const { return GFX90AInsts; }
 
-  bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
+    bool hasFmaakFmamkF32Insts() const {
+      return getGeneration() >= GFX10 || hasGFX940Insts();
+    }
 
-  bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
+    bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
 
-  bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
+    bool hasImageInsts() const { return HasImageInsts; }
 
-  bool hasNSAEncoding() const { return HasNSAEncoding; }
+    bool hasExtendedImageInsts() const { return HasExtendedImageInsts; }
 
-  bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
+    bool hasR128A16() const { return HasR128A16; }
 
-  bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
+    bool hasA16() const { return HasA16; }
 
-  unsigned getNSAMaxSize(bool HasSampler = false) const {
-    return AMDGPU::getNSAMaxSize(*this, HasSampler);
-  }
+    bool hasG16() const { return HasG16; }
 
-  bool hasGFX10_AEncoding() const {
-    return GFX10_AEncoding;
-  }
+    bool hasOffset3fBug() const { return HasOffset3fBug; }
 
-  bool hasGFX10_BEncoding() const {
-    return GFX10_BEncoding;
-  }
+    bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
 
-  bool hasGFX10_3Insts() const {
-    return GFX10_3Insts;
-  }
+    bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
 
-  bool hasMadF16() const;
+    bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
 
-  bool hasMovB64() const { return GFX940Insts || GFX1250Insts; }
+    bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
 
-  bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
+    bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
 
-  // Scalar and global loads support scale_offset bit.
-  bool hasScaleOffset() const { return GFX1250Insts; }
+    bool hasNSAEncoding() const { return HasNSAEncoding; }
 
-  bool hasFlatGVSMode() const { return FlatGVSMode; }
+    bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
 
-  // FLAT GLOBAL VOffset is signed
-  bool hasSignedGVSOffset() const { return GFX1250Insts; }
+    bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
 
-  bool enableSIScheduler() const {
-    return EnableSIScheduler;
-  }
+    unsigned getNSAMaxSize(bool HasSampler = false) const {
+      return AMDGPU::getNSAMaxSize(*this, HasSampler);
+    }
 
-  bool loadStoreOptEnabled() const {
-    return EnableLoadStoreOpt;
-  }
+    bool hasGFX10_AEncoding() const { return GFX10_AEncoding; }
 
-  bool hasSGPRInitBug() const {
-    return SGPRInitBug;
-  }
+    bool hasGFX10_BEncoding() const { return GFX10_BEncoding; }
 
-  bool hasUserSGPRInit16Bug() const {
-    return UserSGPRInit16Bug && isWave32();
-  }
+    bool hasGFX10_3Insts() const { return GFX10_3Insts; }
 
-  bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
+    bool hasMadF16() const;
 
-  bool hasNegativeUnalignedScratchOffsetBug() const {
-    return NegativeUnalignedScratchOffsetBug;
-  }
+    bool hasMovB64() const { return GFX940Insts || GFX1250Insts; }
 
-  bool hasMFMAInlineLiteralBug() const {
-    return HasMFMAInlineLiteralBug;
-  }
+    bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; }
 
-  bool has12DWordStoreHazard() const {
-    return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
-  }
+    // Scalar and global loads support scale_offset bit.
+    bool hasScaleOffset() const { return GFX1250Insts; }
 
-  // \returns true if the subtarget supports DWORDX3 load/store instructions.
-  bool hasDwordx3LoadStores() const {
-    return CIInsts;
-  }
+    bool hasFlatGVSMode() const { return FlatGVSMode; }
 
-  bool hasReadM0MovRelInterpHazard() const {
-    return getGeneration() == AMDGPUSubtarget::GFX9;
-  }
+    // FLAT GLOBAL VOffset is signed
+    bool hasSignedGVSOffset() const { return GFX1250Insts; }
 
-  bool hasReadM0SendMsgHazard() const {
-    return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-           getGeneration() <= AMDGPUSubtarget::GFX9;
-  }
+    bool enableSIScheduler() const { return EnableSIScheduler; }
 
-  bool hasReadM0LdsDmaHazard() const {
-    return getGeneration() == AMDGPUSubtarget::GFX9;
-  }
+    bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; }
 
-  bool hasReadM0LdsDirectHazard() const {
-    return getGeneration() == AMDGPUSubtarget::GFX9;
-  }
+    bool hasSGPRInitBug() const { return SGPRInitBug; }
 
-  bool hasVcmpxPermlaneHazard() const {
-    return HasVcmpxPermlaneHazard;
-  }
+    bool hasUserSGPRInit16Bug() const {
+      return UserSGPRInit16Bug && isWave32();
+    }
 
-  bool hasVMEMtoScalarWriteHazard() const {
-    return HasVMEMtoScalarWriteHazard;
-  }
+    bool hasNegativeScratchOffsetBug() const {
+      return NegativeScratchOffsetBug;
+    }
 
-  bool hasSMEMtoVectorWriteHazard() const {
-    return HasSMEMtoVectorWriteHazard;
-  }
+    bool hasNegativeUnalignedScratchOffsetBug() const {
+      return NegativeUnalignedScratchOffsetBug;
+    }
 
-  bool hasLDSMisalignedBug() const {
-    return LDSMisalignedBug && !EnableCuMode;
-  }
+    bool hasMFMAInlineLiteralBug() const { return HasMFMAInlineLiteralBug; }
 
-  bool hasInstFwdPrefetchBug() const {
-    return HasInstFwdPrefetchBug;
-  }
+    bool has12DWordStoreHazard() const {
+      return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
+    }
 
-  bool hasVcmpxExecWARHazard() const {
-    return HasVcmpxExecWARHazard;
-  }
+    // \returns true if the subtarget supports DWORDX3 load/store instructions.
+    bool hasDwordx3LoadStores() const { return CIInsts; }
 
-  bool hasLdsBranchVmemWARHazard() const {
-    return HasLdsBranchVmemWARHazard;
-  }
+    bool hasReadM0MovRelInterpHazard() const {
+      return getGeneration() == AMDGPUSubtarget::GFX9;
+    }
 
-  // Shift amount of a 64-bit shift cannot be the highest allocated register
-  // if also at the end of the allocation block.
-  bool hasShift64HighRegBug() const {
-    return GFX90AInsts && !GFX940Insts;
-  }
+    bool hasReadM0SendMsgHazard() const {
+      return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+             getGeneration() <= AMDGPUSubtarget::GFX9;
+    }
 
-  // Has one cycle hazard on transcendental instruction feeding a
-  // non transcendental VALU.
-  bool hasTransForwardingHazard() const { return GFX940Insts; }
+    bool hasReadM0LdsDmaHazard() const {
+      return getGeneration() == AMDGPUSubtarget::GFX9;
+    }
 
-  // Has one cycle hazard on a VALU instruction partially writing dst with
-  // a shift of result bits feeding another VALU instruction.
-  bool hasDstSelForwardingHazard() const { return GFX940Insts; }
+    bool hasReadM0LdsDirectHazard() const {
+      return getGeneration() == AMDGPUSubtarget::GFX9;
+    }
 
-  // Cannot use op_sel with v_dot instructions.
-  bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
+    bool hasVcmpxPermlaneHazard() const { return HasVcmpxPermlaneHazard; }
 
-  // Does not have HW interlocks for VALU writing and then reading SGPRs.
-  bool hasVDecCoExecHazard() const {
-    return GFX940Insts;
-  }
+    bool hasVMEMtoScalarWriteHazard() const {
+      return HasVMEMtoScalarWriteHazard;
+    }
 
-  bool hasNSAtoVMEMBug() const {
-    return HasNSAtoVMEMBug;
-  }
+    bool hasSMEMtoVectorWriteHazard() const {
+      return HasSMEMtoVectorWriteHazard;
+    }
 
-  bool hasNSAClauseBug() const { return HasNSAClauseBug; }
+    bool hasLDSMisalignedBug() const {
+      return LDSMisalignedBug && !EnableCuMode;
+    }
 
-  bool hasHardClauses() const { return MaxHardClauseLength > 0; }
+    bool hasInstFwdPrefetchBug() const { return HasInstFwdPrefetchBug; }
 
-  bool hasGFX90AInsts() const { return GFX90AInsts; }
+    bool hasVcmpxExecWARHazard() const { return HasVcmpxExecWARHazard; }
 
-  bool hasFPAtomicToDenormModeHazard() const {
-    return getGeneration() == GFX10;
-  }
+    bool hasLdsBranchVmemWARHazard() const { return HasLdsBranchVmemWARHazard; }
 
-  bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
+    // Shift amount of a 64-bit shift cannot be the highest allocated register
+    // if also at the end of the allocation block.
+    bool hasShift64HighRegBug() const { return GFX90AInsts && !GFX940Insts; }
 
-  bool hasLdsDirect() const { return getGeneration() >= GFX11; }
+    // Has one cycle hazard on transcendental instruction feeding a
+    // non transcendental VALU.
+    bool hasTransForwardingHazard() const { return GFX940Insts; }
 
-  bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
+    // Has one cycle hazard on a VALU instruction partially writing dst with
+    // a shift of result bits feeding another VALU instruction.
+    bool hasDstSelForwardingHazard() const { return GFX940Insts; }
 
-  bool hasVALUPartialForwardingHazard() const {
-    return getGeneration() == GFX11;
-  }
+    // Cannot use op_sel with v_dot instructions.
+    bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
 
-  bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
+    // Does not have HW interlocks for VALU writing and then reading SGPRs.
+    bool hasVDecCoExecHazard() const { return GFX940Insts; }
 
-  bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }
+    bool hasNSAtoVMEMBug() const { return HasNSAtoVMEMBug; }
 
-  bool requiresCodeObjectV6() const { return RequiresCOV6; }
+    bool hasNSAClauseBug() const { return HasNSAClauseBug; }
 
-  bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
+    bool hasHardClauses() const { return MaxHardClauseLength > 0; }
 
-  bool hasGloballyAddressableScratch() const {
-    return HasGloballyAddressableScratch;
-  }
+    bool hasGFX90AInsts() const { return GFX90AInsts; }
 
-  bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
+    bool hasFPAtomicToDenormModeHazard() const {
+      return getGeneration() == GFX10;
+    }
 
-  bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
+    bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
 
-  bool setRegModeNeedsVNOPs() const {
-    return GFX1250Insts && getGeneration() == GFX12;
-  }
+    bool hasLdsDirect() const { return getGeneration() >= GFX11; }
 
-  /// Return if operations acting on VGPR tuples require even alignment.
-  bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
+    bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
 
-  /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
-  bool hasSPackHL() const { return GFX11Insts; }
+    bool hasVALUPartialForwardingHazard() const {
+      return getGeneration() == GFX11;
+    }
 
-  /// Return true if the target's EXP instruction has the COMPR flag, which
-  /// affects the meaning of the EN (enable) bits.
-  bool hasCompressedExport() const { return !GFX11Insts; }
+    bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
 
-  /// Return true if the target's EXP instruction supports the NULL export
-  /// target.
-  bool hasNullExportTarget() const { return !GFX11Insts; }
+    bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }
 
-  bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
+    bool requiresCodeObjectV6() const { return RequiresCOV6; }
 
-  bool hasVOPDInsts() const { return HasVOPDInsts; }
+    bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
 
-  bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
+    bool hasGloballyAddressableScratch() const {
+      return HasGloballyAddressableScratch;
+    }
 
-  /// Return true if the target has the S_DELAY_ALU instruction.
-  bool hasDelayAlu() const { return GFX11Insts; }
+    bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
 
-  bool hasPackedTID() const { return HasPackedTID; }
+    bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; }
 
-  // GFX94* is a derivation of GFX90A. hasGFX940Insts() being true implies that
-  // hasGFX90AInsts is also true.
-  bool hasGFX940Insts() const { return GFX940Insts; }
+    bool setRegModeNeedsVNOPs() const {
+      return GFX1250Insts && getGeneration() == GFX12;
+    }
 
-  // GFX950 is a derivation of GFX94*. hasGFX950Insts() implies that
-  // hasGFX940Insts and hasGFX90AInsts are also true.
-  bool hasGFX950Insts() const { return GFX950Insts; }
+    /// Return if operations acting on VGPR tuples require even alignment.
+    bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
 
-  /// Returns true if the target supports
-  /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
-  /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
-  bool hasLDSLoadB96_B128() const {
-    return hasGFX950Insts();
-  }
+    /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
+    bool hasSPackHL() const { return GFX11Insts; }
 
-  bool hasVMemToLDSLoad() const { return HasVMemToLDSLoad; }
+    /// Return true if the target's EXP instruction has the COMPR flag, which
+    /// affects the meaning of the EN (enable) bits.
+    bool hasCompressedExport() const { return !GFX11Insts; }
 
-  bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
+    /// Return true if the target's EXP instruction supports the NULL export
+    /// target.
+    bool hasNullExportTarget() const { return !GFX11Insts; }
 
-  bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
+    bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
 
-  bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
+    bool hasVOPDInsts() const { return HasVOPDInsts; }
 
-  bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
+    bool hasFlatScratchSVSSwizzleBug() const {
+      return getGeneration() == GFX11;
+    }
 
-  bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
+    /// Return true if the target has the S_DELAY_ALU instruction.
+    bool hasDelayAlu() const { return GFX11Insts; }
 
-  /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
-  /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
-  bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
+    bool hasPackedTID() const { return HasPackedTID; }
 
-  /// \returns true if inline constants are not supported for F16 pseudo
-  /// scalar transcendentals.
-  bool hasNoF16PseudoScalarTransInlineConstants() const {
-    return getGeneration() == GFX12;
-  }
+    // GFX94* is a derivation of GFX90A. hasGFX940Insts() being true implies
+    // that hasGFX90AInsts is also true.
+    bool hasGFX940Insts() const { return GFX940Insts; }
 
-  /// \returns true if the target has instructions with xf32 format support.
-  bool hasXF32Insts() const { return HasXF32Insts; }
+    // GFX950 is a derivation of GFX94*. hasGFX950Insts() implies that
+    // hasGFX940Insts and hasGFX90AInsts are also true.
+    bool hasGFX950Insts() const { return GFX950Insts; }
 
-  bool hasBitOp3Insts() const { return HasBitOp3Insts; }
+    /// Returns true if the target supports
+    /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
+    /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
+    bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
 
-  bool hasPermlane16Swap() const { return HasPermlane16Swap; }
-  bool hasPermlane32Swap() const { return HasPermlane32Swap; }
-  bool hasAshrPkInsts() const { return HasAshrPkInsts; }
+    bool hasVMemToLDSLoad() const { return HasVMemToLDSLoad; }
 
-  bool hasMinimum3Maximum3F32() const {
-    return HasMinimum3Maximum3F32;
-  }
+    bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
 
-  bool hasMinimum3Maximum3F16() const {
-    return HasMinimum3Maximum3F16;
-  }
+    bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
 
-  bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; }
+    bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
 
-  bool hasTanhInsts() const { return HasTanhInsts; }
+    bool hasRequiredExportPriority() const { return HasRequiredExportPriority; }
 
-  bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; }
+    bool hasVmemWriteVgprInOrder() const { return HasVmemWriteVgprInOrder; }
 
-  bool hasAddPC64Inst() const { return GFX1250Insts; }
+    /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
+    /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
+    bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
 
-  bool has1024AddressableVGPRs() const { return Has1024AddressableVGPRs; }
+    /// \returns true if inline constants are not supported for F16 pseudo
+    /// scalar transcendentals.
+    bool hasNoF16PseudoScalarTransInlineConstants() const {
+      return getGeneration() == GFX12;
+    }
 
-  bool hasMinimum3Maximum3PKF16() const {
-    return HasMinimum3Maximum3PKF16;
-  }
+    /// \returns true if the target has instructions with xf32 format support.
+    bool hasXF32Insts() const { return HasXF32Insts; }
 
-  bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; }
+    bool hasBitOp3Insts() const { return HasBitOp3Insts; }
 
-  /// \returns true if the target has s_wait_xcnt insertion. Supported for
-  /// GFX1250.
-  bool hasWaitXCnt() const { return HasWaitXcnt; }
+    bool hasPermlane16Swap() const { return HasPermlane16Swap; }
+    bool hasPermlane32Swap() const { return HasPermlane32Swap; }
+    bool hasAshrPkInsts() const { return HasAshrPkInsts; }
 
-  // A single DWORD instruction can use a 64-bit literal.
-  bool has64BitLiterals() const { return Has64BitLiterals; }
+    bool hasMinimum3Maximum3F32() const { return HasMinimum3Maximum3F32; }
 
-  bool hasPointSampleAccel() const { return HasPointSampleAccel; }
+    bool hasMinimum3Maximum3F16() const { return HasMinimum3Maximum3F16; }
 
-  bool hasLdsBarrierArriveAtomic() const { return HasLdsBarrierArriveAtomic; }
+    bool hasMin3Max3PKF16() const { return HasMin3Max3PKF16; }
 
-  /// \returns The maximum number of instructions that can be enclosed in an
-  /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
-  /// instruction.
-  unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
+    bool hasTanhInsts() const { return HasTanhInsts; }
 
-  bool hasPrngInst() const { return HasPrngInst; }
+    bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; }
 
-  bool hasBVHDualAndBVH8Insts() const { return HasBVHDualAndBVH8Insts; }
+    bool hasAddPC64Inst() const { return GFX1250Insts; }
 
-  /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
-  /// SGPRs
-  unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
+    bool has1024AddressableVGPRs() const { return Has1024AddressableVGPRs; }
 
-  /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
-  /// VGPRs
-  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
-                                    unsigned DynamicVGPRBlockSize) const;
+    bool hasMinimum3Maximum3PKF16() const { return HasMinimum3Maximum3PKF16; }
 
-  /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
-  /// be achieved when the only function running on a CU is \p F, each workgroup
-  /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
-  /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
-  /// range, so this returns a range as well.
-  ///
-  /// Note that occupancy can be affected by the scratch allocation as well, but
-  /// we do not have enough information to compute it.
-  std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
-                                                 unsigned LDSSize = 0,
-                                                 unsigned NumSGPRs = 0,
-                                                 unsigned NumVGPRs = 0) const;
+    bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; }
 
-  /// \returns true if the flat_scratch register should be initialized with the
-  /// pointer to the wave's scratch memory rather than a size and offset.
-  bool flatScratchIsPointer() const {
-    return getGeneration() >= AMDGPUSubtarget::GFX9;
-  }
+    /// \returns true if the target has s_wait_xcnt insertion. Supported for
+    /// GFX1250.
+    bool hasWaitXCnt() const { return HasWaitXcnt; }
 
-  /// \returns true if the flat_scratch register is initialized by the HW.
-  /// In this case it is readonly.
-  bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
+    // A single DWORD instruction can use a 64-bit literal.
+    bool has64BitLiterals() const { return Has64BitLiterals; }
 
-  /// \returns true if the architected SGPRs are enabled.
-  bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
+    bool hasPointSampleAccel() const { return HasPointSampleAccel; }
 
-  /// \returns true if Global Data Share is supported.
-  bool hasGDS() const { return HasGDS; }
+    bool hasLdsBarrierArriveAtomic() const { return HasLdsBarrierArriveAtomic; }
 
-  /// \returns true if Global Wave Sync is supported.
-  bool hasGWS() const { return HasGWS; }
+    /// \returns The maximum number of instructions that can be enclosed in an
+    /// S_CLAUSE on the given subtarget, or 0 for targets that do not support
+    /// that instruction.
+    unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
 
-  /// \returns true if the machine has merged shaders in which s0-s7 are
-  /// reserved by the hardware and user SGPRs start at s8
-  bool hasMergedShaders() const {
-    return getGeneration() >= GFX9;
-  }
+    bool hasPrngInst() const { return HasPrngInst; }
 
-  // \returns true if the target supports the pre-NGG legacy geometry path.
-  bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
+    bool hasBVHDualAndBVH8Insts() const { return HasBVHDualAndBVH8Insts; }
 
-  // \returns true if preloading kernel arguments is supported.
-  bool hasKernargPreload() const { return KernargPreload; }
+    /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
+    /// SGPRs
+    unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
 
-  // \returns true if the target has split barriers feature
-  bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
+    /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
+    /// VGPRs
+    unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
+                                      unsigned DynamicVGPRBlockSize) const;
 
-  // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
-  bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; }
+    /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that
+    /// can be achieved when the only function running on a CU is \p F, each
+    /// workgroup uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs
+    /// SGPRs and \p NumVGPRs VGPRs. The flat workgroup sizes associated to the
+    /// function are a range, so this returns a range as well.
+    ///
+    /// Note that occupancy can be affected by the scratch allocation as well,
+    /// but we do not have enough information to compute it.
+    std::pair<unsigned, unsigned> computeOccupancy(
+        const Function &F, unsigned LDSSize = 0, unsigned NumSGPRs = 0,
+        unsigned NumVGPRs = 0) const;
 
-  // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
-  // no-return form.
-  bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
+    /// \returns true if the flat_scratch register should be initialized with
+    /// the pointer to the wave's scratch memory rather than a size and offset.
+    bool flatScratchIsPointer() const {
+      return getGeneration() >= AMDGPUSubtarget::GFX9;
+    }
 
-  // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
-  bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
+    /// \returns true if the flat_scratch register is initialized by the HW.
+    /// In this case it is readonly.
+    bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
 
-  // \returns true if the target has IEEE kernel descriptor mode bit
-  bool hasIEEEMode() const { return getGeneration() < GFX12; }
+    /// \returns true if the architected SGPRs are enabled.
+    bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
 
-  // \returns true if the target has IEEE fminimum/fmaximum instructions
-  bool hasIEEEMinimumMaximumInsts() const { return HasIEEEMinimumMaximumInsts; }
+    /// \returns true if Global Data Share is supported.
+    bool hasGDS() const { return HasGDS; }
 
-  // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
-  bool hasRrWGMode() const { return getGeneration() >= GFX12; }
+    /// \returns true if Global Wave Sync is supported.
+    bool hasGWS() const { return HasGWS; }
 
-  /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
-  /// values.
-  bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
+    /// \returns true if the machine has merged shaders in which s0-s7 are
+    /// reserved by the hardware and user SGPRs start at s8
+    bool hasMergedShaders() const { return getGeneration() >= GFX9; }
 
-  bool hasGFX1250Insts() const { return GFX1250Insts; }
+    // \returns true if the target supports the pre-NGG legacy geometry path.
+    bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
 
-  bool hasVOPD3() const { return GFX1250Insts; }
+    // \returns true if preloading kernel arguments is supported.
+    bool hasKernargPreload() const { return KernargPreload; }
 
-  // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
-  bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
+    // \returns true if the target has split barriers feature
+    bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
 
-  // \returns true if the target has V_MAD_U32 instruction.
-  bool hasMadU32Inst() const { return HasMadU32Inst; }
+    // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
+    bool hasCvtFP8VOP1Bug() const { return HasCvtFP8Vop1Bug; }
 
-  // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
-  bool hasVectorMulU64() const { return GFX1250Insts; }
+    // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
+    // no-return form.
+    bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
 
-  // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
-  // instructions.
-  bool hasMadU64U32NoCarry() const { return GFX1250Insts; }
+    // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
+    bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
 
-  // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
-  bool hasIntMinMax64() const { return GFX1250Insts; }
+    // \returns true if the target has IEEE kernel descriptor mode bit
+    bool hasIEEEMode() const { return getGeneration() < GFX12; }
 
-  // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions.
-  bool hasAddMinMaxInsts() const { return GFX1250Insts; }
+    // \returns true if the target has IEEE fminimum/fmaximum instructions
+    bool hasIEEEMinimumMaximumInsts() const {
+      return HasIEEEMinimumMaximumInsts;
+    }
 
-  // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
-  bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
+    // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
+    bool hasRrWGMode() const { return getGeneration() >= GFX12; }
 
-  // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
-  bool hasPkMinMax3Insts() const { return GFX1250Insts; }
+    /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
+    /// values.
+    bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
 
-  // \returns true if target has S_GET_SHADER_CYCLES_U64 instruction.
-  bool hasSGetShaderCyclesInst() const { return GFX1250Insts; }
+    bool hasGFX1250Insts() const { return GFX1250Insts; }
 
-  // \returns true if target has S_SETPRIO_INC_WG instruction.
-  bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }
+    bool hasVOPD3() const { return GFX1250Insts; }
 
-  // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
-  // of sign-extending. Note that GFX1250 has not only fixed the bug but also
-  // extended VA to 57 bits.
-  bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; }
+    // \returns true if the target has V_ADD_U64/V_SUB_U64 instructions.
+    bool hasAddSubU64Insts() const { return HasAddSubU64Insts; }
 
-  // \returns true if the target needs to create a prolog for backward
-  // compatibility when preloading kernel arguments.
-  bool needsKernArgPreloadProlog() const {
-    return hasKernargPreload() && !GFX1250Insts;
-  }
+    // \returns true if the target has V_MAD_U32 instruction.
+    bool hasMadU32Inst() const { return HasMadU32Inst; }
 
-  /// \returns SGPR allocation granularity supported by the subtarget.
-  unsigned getSGPRAllocGranule() const {
-    return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
-  }
+    // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
+    bool hasVectorMulU64() const { return GFX1250Insts; }
 
-  /// \returns SGPR encoding granularity supported by the subtarget.
-  unsigned getSGPREncodingGranule() const {
-    return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
-  }
+    // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
+    // instructions.
+    bool hasMadU64U32NoCarry() const { return GFX1250Insts; }
 
-  /// \returns Total number of SGPRs supported by the subtarget.
-  unsigned getTotalNumSGPRs() const {
-    return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
-  }
+    // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
+    bool hasIntMinMax64() const { return GFX1250Insts; }
 
-  /// \returns Addressable number of SGPRs supported by the subtarget.
-  unsigned getAddressableNumSGPRs() const {
-    return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
-  }
+    // \returns true if the target has V_ADD_{MIN|MAX}_{I|U}32 instructions.
+    bool hasAddMinMaxInsts() const { return GFX1250Insts; }
 
-  /// \returns Minimum number of SGPRs that meets the given number of waves per
-  /// execution unit requirement supported by the subtarget.
-  unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
-    return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
-  }
+    // \returns true if the target has V_PK_ADD_{MIN|MAX}_{I|U}16 instructions.
+    bool hasPkAddMinMaxInsts() const { return GFX1250Insts; }
 
-  /// \returns Maximum number of SGPRs that meets the given number of waves per
-  /// execution unit requirement supported by the subtarget.
-  unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
-    return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
-  }
+    // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
+    bool hasPkMinMax3Insts() const { return GFX1250Insts; }
 
-  /// \returns Reserved number of SGPRs. This is a common
-  /// utility function called by MachineFunction and
-  /// Function variants of getReservedNumSGPRs.
-  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
-  /// \returns Reserved number of SGPRs for given machine function \p MF.
-  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
+    // \returns true if target has S_GET_SHADER_CYCLES_U64 instruction.
+    bool hasSGetShaderCyclesInst() const { return GFX1250Insts; }
 
-  /// \returns Reserved number of SGPRs for given function \p F.
-  unsigned getReservedNumSGPRs(const Function &F) const;
+    // \returns true if target has S_SETPRIO_INC_WG instruction.
+    bool hasSetPrioIncWgInst() const { return HasSetPrioIncWgInst; }
 
-  /// \returns Maximum number of preloaded SGPRs for the subtarget.
-  unsigned getMaxNumPreloadedSGPRs() const;
+    // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
+    // of sign-extending. Note that GFX1250 has not only fixed the bug but also
+    // extended VA to 57 bits.
+    bool hasGetPCZeroExtension() const { return GFX12Insts && !GFX1250Insts; }
 
-  /// \returns max num SGPRs. This is the common utility
-  /// function called by MachineFunction and Function
-  /// variants of getMaxNumSGPRs.
-  unsigned getBaseMaxNumSGPRs(const Function &F,
-                              std::pair<unsigned, unsigned> WavesPerEU,
-                              unsigned PreloadedSGPRs,
-                              unsigned ReservedNumSGPRs) const;
+    // \returns true if the target needs to create a prolog for backward
+    // compatibility when preloading kernel arguments.
+    bool needsKernArgPreloadProlog() const {
+      return hasKernargPreload() && !GFX1250Insts;
+    }
 
-  /// \returns Maximum number of SGPRs that meets number of waves per execution
-  /// unit requirement for function \p MF, or number of SGPRs explicitly
-  /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
-  ///
-  /// \returns Value that meets number of waves per execution unit requirement
-  /// if explicitly requested value cannot be converted to integer, violates
-  /// subtarget's specifications, or does not meet number of waves per execution
-  /// unit requirement.
-  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
+    /// \returns SGPR allocation granularity supported by the subtarget.
+    unsigned getSGPRAllocGranule() const {
+      return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
+    }
 
-  /// \returns Maximum number of SGPRs that meets number of waves per execution
-  /// unit requirement for function \p F, or number of SGPRs explicitly
-  /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
-  ///
-  /// \returns Value that meets number of waves per execution unit requirement
-  /// if explicitly requested value cannot be converted to integer, violates
-  /// subtarget's specifications, or does not meet number of waves per execution
-  /// unit requirement.
-  unsigned getMaxNumSGPRs(const Function &F) const;
+    /// \returns SGPR encoding granularity supported by the subtarget.
+    unsigned getSGPREncodingGranule() const {
+      return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
+    }
 
-  /// \returns VGPR allocation granularity supported by the subtarget.
-  unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
-    return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize);
-  }
+    /// \returns Total number of SGPRs supported by the subtarget.
+    unsigned getTotalNumSGPRs() const {
+      return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
+    }
 
-  /// \returns VGPR encoding granularity supported by the subtarget.
-  unsigned getVGPREncodingGranule() const {
-    return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
-  }
+    /// \returns Addressable number of SGPRs supported by the subtarget.
+    unsigned getAddressableNumSGPRs() const {
+      return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
+    }
 
-  /// \returns Total number of VGPRs supported by the subtarget.
-  unsigned getTotalNumVGPRs() const {
-    return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
-  }
+    /// \returns Minimum number of SGPRs that meets the given number of waves
+    /// per execution unit requirement supported by the subtarget.
+    unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
+      return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
+    }
 
-  /// \returns Addressable number of architectural VGPRs supported by the
-  /// subtarget.
-  unsigned getAddressableNumArchVGPRs() const {
-    return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
-  }
+    /// \returns Maximum number of SGPRs that meets the given number of waves
+    /// per execution unit requirement supported by the subtarget.
+    unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
+      return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
+    }
 
-  /// \returns Addressable number of VGPRs supported by the subtarget.
-  unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
-    return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize);
-  }
+    /// \returns Reserved number of SGPRs. This is a common
+    /// utility function called by MachineFunction and
+    /// Function variants of getReservedNumSGPRs.
+    unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
+    /// \returns Reserved number of SGPRs for given machine function \p MF.
+    unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
+
+    /// \returns Reserved number of SGPRs for given function \p F.
+    unsigned getReservedNumSGPRs(const Function &F) const;
+
+    /// \returns Maximum number of preloaded SGPRs for the subtarget.
+    unsigned getMaxNumPreloadedSGPRs() const;
+
+    /// \returns max num SGPRs. This is the common utility
+    /// function called by MachineFunction and Function
+    /// variants of getMaxNumSGPRs.
+    unsigned getBaseMaxNumSGPRs(
+        const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
+        unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const;
+
+    /// \returns Maximum number of SGPRs that meets number of waves per
+    /// execution unit requirement for function \p MF, or number of SGPRs
+    /// explicitly requested using "amdgpu-num-sgpr" attribute attached to
+    /// function \p MF.
+    ///
+    /// \returns Value that meets number of waves per execution unit requirement
+    /// if explicitly requested value cannot be converted to integer, violates
+    /// subtarget's specifications, or does not meet number of waves per
+    /// execution unit requirement.
+    unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
+
+    /// \returns Maximum number of SGPRs that meets number of waves per
+    /// execution unit requirement for function \p F, or number of SGPRs
+    /// explicitly requested using "amdgpu-num-sgpr" attribute attached to
+    /// function \p F.
+    ///
+    /// \returns Value that meets number of waves per execution unit requirement
+    /// if explicitly requested value cannot be converted to integer, violates
+    /// subtarget's specifications, or does not meet number of waves per
+    /// execution unit requirement.
+    unsigned getMaxNumSGPRs(const Function &F) const;
+
+    /// \returns VGPR allocation granularity supported by the subtarget.
+    unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
+      return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize);
+    }
 
-  /// \returns the minimum number of VGPRs that will prevent achieving more than
-  /// the specified number of waves \p WavesPerEU.
-  unsigned getMinNumVGPRs(unsigned WavesPerEU,
-                          unsigned DynamicVGPRBlockSize) const {
-    return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU,
-                                           DynamicVGPRBlockSize);
-  }
+    /// \returns VGPR encoding granularity supported by the subtarget.
+    unsigned getVGPREncodingGranule() const {
+      return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
+    }
 
-  /// \returns the maximum number of VGPRs that can be used and still achieve
-  /// at least the specified number of waves \p WavesPerEU.
-  unsigned getMaxNumVGPRs(unsigned WavesPerEU,
-                          unsigned DynamicVGPRBlockSize) const {
-    return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU,
-                                           DynamicVGPRBlockSize);
-  }
+    /// \returns Total number of VGPRs supported by the subtarget.
+    unsigned getTotalNumVGPRs() const {
+      return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
+    }
 
-  /// \returns max num VGPRs. This is the common utility function
-  /// called by MachineFunction and Function variants of getMaxNumVGPRs.
-  unsigned
-  getBaseMaxNumVGPRs(const Function &F,
-                     std::pair<unsigned, unsigned> NumVGPRBounds) const;
+    /// \returns Addressable number of architectural VGPRs supported by the
+    /// subtarget.
+    unsigned getAddressableNumArchVGPRs() const {
+      return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
+    }
 
-  /// \returns Maximum number of VGPRs that meets number of waves per execution
-  /// unit requirement for function \p F, or number of VGPRs explicitly
-  /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
-  ///
-  /// \returns Value that meets number of waves per execution unit requirement
-  /// if explicitly requested value cannot be converted to integer, violates
-  /// subtarget's specifications, or does not meet number of waves per execution
-  /// unit requirement.
-  unsigned getMaxNumVGPRs(const Function &F) const;
+    /// \returns Addressable number of VGPRs supported by the subtarget.
+    unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
+      return AMDGPU::IsaInfo::getAddressableNumVGPRs(this,
+                                                     DynamicVGPRBlockSize);
+    }
 
-  unsigned getMaxNumAGPRs(const Function &F) const {
-    return getMaxNumVGPRs(F);
-  }
+    /// \returns the minimum number of VGPRs that will prevent achieving more
+    /// than the specified number of waves \p WavesPerEU.
+    unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize)
+        const {
+      return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU,
+                                             DynamicVGPRBlockSize);
+    }
 
-  /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
-  /// of waves per execution unit required for the function \p F.
-  std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
+    /// \returns the maximum number of VGPRs that can be used and still achieve
+    /// at least the specified number of waves \p WavesPerEU.
+    unsigned getMaxNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize)
+        const {
+      return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU,
+                                             DynamicVGPRBlockSize);
+    }
 
-  /// \returns Maximum number of VGPRs that meets number of waves per execution
-  /// unit requirement for function \p MF, or number of VGPRs explicitly
-  /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
-  ///
-  /// \returns Value that meets number of waves per execution unit requirement
-  /// if explicitly requested value cannot be converted to integer, violates
-  /// subtarget's specifications, or does not meet number of waves per execution
-  /// unit requirement.
-  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+    /// \returns max num VGPRs. This is the common utility function
+    /// called by MachineFunction and Function variants of getMaxNumVGPRs.
+    unsigned getBaseMaxNumVGPRs(
+        const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const;
+
+    /// \returns Maximum number of VGPRs that meets number of waves per
+    /// execution unit requirement for function \p F, or number of VGPRs
+    /// explicitly requested using "amdgpu-num-vgpr" attribute attached to
+    /// function \p F.
+    ///
+    /// \returns Value that meets number of waves per execution unit requirement
+    /// if explicitly requested value cannot be converted to integer, violates
+    /// subtarget's specifications, or does not meet number of waves per
+    /// execution unit requirement.
+    unsigned getMaxNumVGPRs(const Function &F) const;
+
+    unsigned getMaxNumAGPRs(const Function &F) const {
+      return getMaxNumVGPRs(F);
+    }
 
-  bool supportsWave32() const { return getGeneration() >= GFX10; }
+    /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
+    /// of waves per execution unit required for the function \p F.
+    std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
 
-  bool supportsWave64() const { return !hasGFX1250Insts(); }
+    /// \returns Maximum number of VGPRs that meets number of waves per
+    /// execution unit requirement for function \p MF, or number of VGPRs
+    /// explicitly requested using "amdgpu-num-vgpr" attribute attached to
+    /// function \p MF.
+    ///
+    /// \returns Value that meets number of waves per execution unit requirement
+    /// if explicitly requested value cannot be converted to integer, violates
+    /// subtarget's specifications, or does not meet number of waves per
+    /// execution unit requirement.
+    unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
 
-  bool isWave32() const {
-    return getWavefrontSize() == 32;
-  }
+    bool supportsWave32() const { return getGeneration() >= GFX10; }
 
-  bool isWave64() const {
-    return getWavefrontSize() == 64;
-  }
+    bool supportsWave64() const { return !hasGFX1250Insts(); }
 
-  /// Returns whether the wavesize of this subtarget is reliably known. This is
-  /// false only for a default target-cpu that does not have an explicit
-  /// +wavefrontsize target feature.
-  bool isWaveSizeKnown() const {
-    return hasFeature(AMDGPU::FeatureWavefrontSize32) ||
-           hasFeature(AMDGPU::FeatureWavefrontSize64);
-  }
+    bool isWave32() const { return getWavefrontSize() == 32; }
 
-  const TargetRegisterClass *getBoolRC() const {
-    return getRegisterInfo()->getBoolRC();
-  }
+    bool isWave64() const { return getWavefrontSize() == 64; }
 
-  /// \returns Maximum number of work groups per compute unit supported by the
-  /// subtarget and limited by given \p FlatWorkGroupSize.
-  unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
-    return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
-  }
+    /// Returns whether the wavesize of this subtarget is reliably known. This
+    /// is false only for a default target-cpu that does not have an explicit
+    /// +wavefrontsize target feature.
+    bool isWaveSizeKnown() const {
+      return hasFeature(AMDGPU::FeatureWavefrontSize32) ||
+             hasFeature(AMDGPU::FeatureWavefrontSize64);
+    }
 
-  /// \returns Minimum flat work group size supported by the subtarget.
-  unsigned getMinFlatWorkGroupSize() const override {
-    return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
-  }
+    const TargetRegisterClass *getBoolRC() const {
+      return getRegisterInfo()->getBoolRC();
+    }
 
-  /// \returns Maximum flat work group size supported by the subtarget.
-  unsigned getMaxFlatWorkGroupSize() const override {
-    return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
-  }
+    /// \returns Maximum number of work groups per compute unit supported by the
+    /// subtarget and limited by given \p FlatWorkGroupSize.
+    unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
+      return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
+    }
 
-  /// \returns Number of waves per execution unit required to support the given
-  /// \p FlatWorkGroupSize.
-  unsigned
-  getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
-    return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
-  }
+    /// \returns Minimum flat work group size supported by the subtarget.
+    unsigned getMinFlatWorkGroupSize() const override {
+      return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
+    }
 
-  /// \returns Minimum number of waves per execution unit supported by the
-  /// subtarget.
-  unsigned getMinWavesPerEU() const override {
-    return AMDGPU::IsaInfo::getMinWavesPerEU(this);
-  }
+    /// \returns Maximum flat work group size supported by the subtarget.
+    unsigned getMaxFlatWorkGroupSize() const override {
+      return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
+    }
 
-  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
-                             SDep &Dep,
-                             const TargetSchedModel *SchedModel) const override;
+    /// \returns Number of waves per execution unit required to support the
+    /// given
+    /// \p FlatWorkGroupSize.
+    unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize)
+        const override {
+      return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this,
+                                                        FlatWorkGroupSize);
+    }
 
-  // \returns true if it's beneficial on this subtarget for the scheduler to
-  // cluster stores as well as loads.
-  bool shouldClusterStores() const { return getGeneration() >= GFX11; }
+    /// \returns Minimum number of waves per execution unit supported by the
+    /// subtarget.
+    unsigned getMinWavesPerEU() const override {
+      return AMDGPU::IsaInfo::getMinWavesPerEU(this);
+    }
 
-  // \returns the number of address arguments from which to enable MIMG NSA
-  // on supported architectures.
-  unsigned getNSAThreshold(const MachineFunction &MF) const;
+    void adjustSchedDependency(
+        SUnit * Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
+        const TargetSchedModel *SchedModel) const override;
 
-  // \returns true if the subtarget has a hazard requiring an "s_nop 0"
-  // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
-  bool requiresNopBeforeDeallocVGPRs() const { return !GFX1250Insts; }
+    // \returns true if it's beneficial on this subtarget for the scheduler to
+    // cluster stores as well as loads.
+    bool shouldClusterStores() const { return getGeneration() >= GFX11; }
 
-  // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
-  // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
-  bool requiresWaitIdleBeforeGetReg() const { return GFX1250Insts; }
+    // \returns the number of address arguments from which to enable MIMG NSA
+    // on supported architectures.
+    unsigned getNSAThreshold(const MachineFunction &MF) const;
 
-  bool isDynamicVGPREnabled() const { return DynamicVGPR; }
-  unsigned getDynamicVGPRBlockSize() const {
-    return DynamicVGPRBlockSize32 ? 32 : 16;
-  }
+    // \returns true if the subtarget has a hazard requiring an "s_nop 0"
+    // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
+    bool requiresNopBeforeDeallocVGPRs() const { return !GFX1250Insts; }
 
-  bool requiresDisjointEarlyClobberAndUndef() const override {
-    // AMDGPU doesn't care if early-clobber and undef operands are allocated
-    // to the same register.
-    return false;
-  }
+    // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
+    // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
+    bool requiresWaitIdleBeforeGetReg() const { return GFX1250Insts; }
 
-  // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
-  // and must be surrounded by S_WAIT_ALU(0xFFE3).
-  bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
-    return getGeneration() == GFX12;
-  }
+    bool isDynamicVGPREnabled() const { return DynamicVGPR; }
+    unsigned getDynamicVGPRBlockSize() const {
+      return DynamicVGPRBlockSize32 ? 32 : 16;
+    }
 
-  // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
-  // read.
-  bool hasScratchBaseForwardingHazard() const {
-    return GFX1250Insts && getGeneration() == GFX12;
-  }
+    bool requiresDisjointEarlyClobberAndUndef() const override {
+      // AMDGPU doesn't care if early-clobber and undef operands are allocated
+      // to the same register.
+      return false;
+    }
 
-  /// \returns true if the subtarget supports clusters of workgroups.
-  bool hasClusters() const { return GFX1250Insts; }
+    // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
+    // and must be surrounded by S_WAIT_ALU(0xFFE3).
+    bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
+      return getGeneration() == GFX12;
+    }
 
-  /// \returns true if the subtarget requires a wait for xcnt before atomic
-  /// flat/global stores & rmw.
-  bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
+    // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
+    // read.
+    bool hasScratchBaseForwardingHazard() const {
+      return GFX1250Insts && getGeneration() == GFX12;
+    }
 
-  /// \returns the number of significant bits in the immediate field of the
-  /// S_NOP instruction.
-  unsigned getSNopBits() const {
-    if (getGeneration() >= AMDGPUSubtarget::GFX12)
-      return 7;
-    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
-      return 4;
-    return 3;
-  }
+    /// \returns true if the subtarget supports clusters of workgroups.
+    bool hasClusters() const { return GFX1250Insts; }
+
+    /// \returns true if the subtarget requires a wait for xcnt before atomic
+    /// flat/global stores & rmw.
+    bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
+
+    /// \returns the number of significant bits in the immediate field of the
+    /// S_NOP instruction.
+    unsigned getSNopBits() const {
+      if (getGeneration() >= AMDGPUSubtarget::GFX12)
+        return 7;
+      if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+        return 4;
+      return 3;
+    }
 
-  /// \returns true if the sub-target supports buffer resource (V#) with 45-bit
-  /// num_records.
-  bool has45BitNumRecordsBufferResource() const {
-    return Has45BitNumRecordsBufferResource;
-  }
-};
+    /// \returns true if the sub-target supports buffer resource (V#) with
+    /// 45-bit num_records.
+    bool has45BitNumRecordsBufferResource() const {
+      return Has45BitNumRecordsBufferResource;
+    }
+  };
 
 class GCNUserSGPRUsageInfo {
 public:

``````````

</details>


https://github.com/llvm/llvm-project/pull/160922


More information about the llvm-commits mailing list