[llvm] [AMDGPU] Support lowering of cluster-related intrinsics (PR #157978)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 10 18:01:33 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Shilei Tian (shiltian)
Changes:
Since much of the code is interconnected, this also changes how the workgroup ID is lowered.
Co-authored-by: Jay Foad <jay.foad@amd.com>
Co-authored-by: Ivan Kosarev <ivan.kosarev@amd.com>
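For intuition, here is a minimal scalar model of the new workgroup-ID lowering (a sketch with hypothetical names, not code from this patch): when clusters are in use, the preloaded workgroup-ID inputs hold the cluster ID, so the flat workgroup ID is reconstructed per dimension.

```c++
#include <cstdint>

// Sketch only: models the select built in legalizeWorkGroupId /
// lowerWorkGroupId below. ClusterMaxId is inclusive, so the per-dimension
// cluster size is ClusterMaxId + 1.
uint32_t modelWorkGroupId(uint32_t ClusterId, uint32_t ClusterMaxId,
                          uint32_t ClusterWorkGroupId, bool ClustersEnabled) {
  uint32_t GlobalId = ClusterId * (ClusterMaxId + 1) + ClusterWorkGroupId;
  return ClustersEnabled ? GlobalId : ClusterId;
}
```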
---
Patch is 220.36 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157978.diff
21 Files Affected:
- (modified) llvm/docs/AMDGPUUsage.rst (+7)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp (+8)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h (+13-6)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+212-9)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h (+8)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+203-8)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+9)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+2-1)
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp (+2)
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h (+5)
- (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+13-6)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+48)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+44)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll (+1258)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll (+194)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll (+1077)
- (modified) llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll (+1-1)
- (added) llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-opt.ll (+390)
- (added) llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll (+376)
- (modified) llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll (+25-7)
- (modified) llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll (+210-4)
``````````diff
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 37563203f2f83..cef87e077cc5c 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1812,6 +1812,13 @@ The AMDGPU backend supports the following LLVM IR attributes.
offset by one less than the number of dynamic VGPR blocks required
by the function encoded in bits 5..3.
+ "amdgpu-cluster-dims"="x,y,z" Specify the cluster workgroup dimensions. A value of "0,0,0" indicates that
+ cluster is disabled. A value of "1024,1024,1024" indicates that cluster is enabled,
+ but the dimensions cannot be determined at compile time. Any other value explicitly
+ specifies the cluster dimensions.
+
+ This is only relevant on targets with cluster support.
+
================================================ ==========================================================
Calling Conventions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index d158f0f58d711..dda8033f47398 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -107,6 +107,14 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
return std::tuple(WorkGroupIDZ ? &WorkGroupIDZ : nullptr,
&AMDGPU::SGPR_32RegClass, LLT::scalar(32));
+ case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
+ case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
+ case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
+ case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
+ case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
+ case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
+ case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
+ return std::tuple(nullptr, &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::LDS_KERNEL_ID:
return std::tuple(LDSKernelId ? &LDSKernelId : nullptr,
&AMDGPU::SGPR_32RegClass, LLT::scalar(32));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index e07d47381ecca..1064e57b9da9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -111,18 +111,25 @@ struct AMDGPUFunctionArgInfo {
DISPATCH_ID = 4,
FLAT_SCRATCH_INIT = 5,
LDS_KERNEL_ID = 6, // LLVM internal, not part of the ABI
- WORKGROUP_ID_X = 10,
- WORKGROUP_ID_Y = 11,
- WORKGROUP_ID_Z = 12,
+ WORKGROUP_ID_X = 10, // Also used for cluster ID X.
+ WORKGROUP_ID_Y = 11, // Also used for cluster ID Y.
+ WORKGROUP_ID_Z = 12, // Also used for cluster ID Z.
PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
IMPLICIT_BUFFER_PTR = 15,
IMPLICIT_ARG_PTR = 16,
PRIVATE_SEGMENT_SIZE = 17,
+ CLUSTER_WORKGROUP_ID_X = 21,
+ CLUSTER_WORKGROUP_ID_Y = 22,
+ CLUSTER_WORKGROUP_ID_Z = 23,
+ CLUSTER_WORKGROUP_MAX_ID_X = 24,
+ CLUSTER_WORKGROUP_MAX_ID_Y = 25,
+ CLUSTER_WORKGROUP_MAX_ID_Z = 26,
+ CLUSTER_WORKGROUP_MAX_FLAT_ID = 27,
// VGPRS:
- WORKITEM_ID_X = 18,
- WORKITEM_ID_Y = 19,
- WORKITEM_ID_Z = 20,
+ WORKITEM_ID_X = 28,
+ WORKITEM_ID_Y = 29,
+ WORKITEM_ID_Z = 30,
FIRST_VGPR_VALUE = WORKITEM_ID_X
};
// clang-format on
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index f18536cd4ab93..b5a41e3fbf8fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4452,6 +4452,74 @@ void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
}
}
+bool AMDGPULegalizerInfo::legalizeWorkGroupId(
+ MachineInstr &MI, MachineIRBuilder &B,
+ AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
+ AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
+ AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!ST.hasClusters()) {
+ if (!loadInputValue(DstReg, B, WorkGroupIdPV))
+ return false;
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Clusters are supported. Return the global position in the grid. If clusters
+ // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
+
+ // WorkGroupIdXYZ = ClusterId == 0 ?
+ // ClusterIdXYZ :
+ // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
+ MachineRegisterInfo &MRI = *B.getMRI();
+ const LLT S32 = LLT::scalar(32);
+ Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
+ Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
+ Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
+ if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
+ !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
+ !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
+ return false;
+
+ auto One = B.buildConstant(S32, 1);
+ auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
+ auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
+ B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
+
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+
+ switch (MFI->getClusterDims().getKind()) {
+ case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
+ case AMDGPU::ClusterDimsAttr::Kind::VariableDims: {
+ B.buildCopy(DstReg, GlobalIdXYZ);
+ MI.eraseFromParent();
+ return true;
+ }
+ case AMDGPU::ClusterDimsAttr::Kind::NoCluster: {
+ B.buildCopy(DstReg, ClusterIdXYZ);
+ MI.eraseFromParent();
+ return true;
+ }
+ case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
+ using namespace AMDGPU::Hwreg;
+ unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
+ Register ClusterId = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
+ B.buildInstr(AMDGPU::S_GETREG_B32_const)
+ .addDef(ClusterId)
+ .addImm(ClusterIdField);
+ auto Zero = B.buildConstant(S32, 0);
+ auto NoClusters =
+ B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
+ B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
+ MI.eraseFromParent();
+ return true;
+ }
+ }
+
+ llvm_unreachable("nothing should reach here");
+}
+
bool AMDGPULegalizerInfo::loadInputValue(
Register DstReg, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
@@ -4471,8 +4539,31 @@ bool AMDGPULegalizerInfo::loadInputValue(
AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
const ArgDescriptor WorkGroupIDZ =
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
+ const ArgDescriptor ClusterWorkGroupIDX =
+ ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
+ const ArgDescriptor ClusterWorkGroupIDY =
+ ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
+ const ArgDescriptor ClusterWorkGroupIDZ =
+ ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
+ const ArgDescriptor ClusterWorkGroupMaxIDX =
+ ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
+ const ArgDescriptor ClusterWorkGroupMaxIDY =
+ ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
+ const ArgDescriptor ClusterWorkGroupMaxIDZ =
+ ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
+ const ArgDescriptor ClusterWorkGroupMaxFlatID =
+ ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
+
+ auto LoadConstant = [&](unsigned N) {
+ B.buildConstant(DstReg, N);
+ return true;
+ };
+
if (ST.hasArchitectedSGPRs() &&
(AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
+ AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
+ bool HasFixedDims = ClusterDims.isFixedDims();
+
switch (ArgType) {
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
Arg = &WorkGroupIDX;
@@ -4489,6 +4580,53 @@ bool AMDGPULegalizerInfo::loadInputValue(
ArgRC = &AMDGPU::SReg_32RegClass;
ArgTy = LLT::scalar(32);
break;
+ case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
+ if (HasFixedDims && ClusterDims.getDims()[0] == 1)
+ return LoadConstant(0);
+ Arg = &ClusterWorkGroupIDX;
+ ArgRC = &AMDGPU::SReg_32RegClass;
+ ArgTy = LLT::scalar(32);
+ break;
+ case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
+ if (HasFixedDims && ClusterDims.getDims()[1] == 1)
+ return LoadConstant(0);
+ Arg = &ClusterWorkGroupIDY;
+ ArgRC = &AMDGPU::SReg_32RegClass;
+ ArgTy = LLT::scalar(32);
+ break;
+ case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
+ if (HasFixedDims && ClusterDims.getDims()[2] == 1)
+ return LoadConstant(0);
+ Arg = &ClusterWorkGroupIDZ;
+ ArgRC = &AMDGPU::SReg_32RegClass;
+ ArgTy = LLT::scalar(32);
+ break;
+ case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
+ if (HasFixedDims)
+ return LoadConstant(ClusterDims.getDims()[0] - 1);
+ Arg = &ClusterWorkGroupMaxIDX;
+ ArgRC = &AMDGPU::SReg_32RegClass;
+ ArgTy = LLT::scalar(32);
+ break;
+ case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
+ if (HasFixedDims)
+ return LoadConstant(ClusterDims.getDims()[1] - 1);
+ Arg = &ClusterWorkGroupMaxIDY;
+ ArgRC = &AMDGPU::SReg_32RegClass;
+ ArgTy = LLT::scalar(32);
+ break;
+ case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
+ if (HasFixedDims)
+ return LoadConstant(ClusterDims.getDims()[2] - 1);
+ Arg = &ClusterWorkGroupMaxIDZ;
+ ArgRC = &AMDGPU::SReg_32RegClass;
+ ArgTy = LLT::scalar(32);
+ break;
+ case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
+ Arg = &ClusterWorkGroupMaxFlatID;
+ ArgRC = &AMDGPU::SReg_32RegClass;
+ ArgTy = LLT::scalar(32);
+ break;
default:
break;
}
@@ -4499,10 +4637,9 @@ bool AMDGPULegalizerInfo::loadInputValue(
if (!Arg) {
if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
- // The intrinsic may appear when we have a 0 sized kernarg segment, in which
- // case the pointer argument may be missing and we use null.
- B.buildConstant(DstReg, 0);
- return true;
+ // The intrinsic may appear when we have a 0 sized kernarg segment, in
+ // which case the pointer argument may be missing and we use null.
+ return LoadConstant(0);
}
// It's undefined behavior if a function marked with the amdgpu-no-*
@@ -7415,6 +7552,22 @@ bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
return true;
}
+bool AMDGPULegalizerInfo::legalizeConstHwRegRead(MachineInstr &MI,
+ MachineIRBuilder &B,
+ AMDGPU::Hwreg::Id HwReg,
+ unsigned LowBit,
+ unsigned Width) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!MRI.getRegClassOrNull(DstReg))
+ MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
+ B.buildInstr(AMDGPU::S_GETREG_B32_const)
+ .addDef(DstReg)
+ .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
+ MI.eraseFromParent();
+ return true;
+}
+
static constexpr unsigned FPEnvModeBitField =
AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
@@ -7577,14 +7730,64 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
case Intrinsic::amdgcn_workgroup_id_x:
- return legalizePreloadedArgIntrin(MI, MRI, B,
- AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
+ return legalizeWorkGroupId(
+ MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
+ AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
+ AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
case Intrinsic::amdgcn_workgroup_id_y:
- return legalizePreloadedArgIntrin(MI, MRI, B,
- AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
+ return legalizeWorkGroupId(
+ MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
+ AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
+ AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
case Intrinsic::amdgcn_workgroup_id_z:
- return legalizePreloadedArgIntrin(MI, MRI, B,
+ return legalizeWorkGroupId(
+ MI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
+ AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
+ AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
+ case Intrinsic::amdgcn_cluster_id_x:
+ return ST.hasGFX1250Insts() &&
+ legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
+ case Intrinsic::amdgcn_cluster_id_y:
+ return ST.hasGFX1250Insts() &&
+ legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
+ case Intrinsic::amdgcn_cluster_id_z:
+ return ST.hasGFX1250Insts() &&
+ legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+ case Intrinsic::amdgcn_cluster_workgroup_id_x:
+ return ST.hasGFX1250Insts() &&
+ legalizePreloadedArgIntrin(
+ MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
+ case Intrinsic::amdgcn_cluster_workgroup_id_y:
+ return ST.hasGFX1250Insts() &&
+ legalizePreloadedArgIntrin(
+ MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
+ case Intrinsic::amdgcn_cluster_workgroup_id_z:
+ return ST.hasGFX1250Insts() &&
+ legalizePreloadedArgIntrin(
+ MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
+ case Intrinsic::amdgcn_cluster_workgroup_flat_id:
+ return AMDGPU::isGFX1250(ST) &&
+ legalizeConstHwRegRead(MI, B, AMDGPU::Hwreg::ID_IB_STS2, 21, 4);
+ case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
+ return ST.hasGFX1250Insts() &&
+ legalizePreloadedArgIntrin(
+ MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X);
+ case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
+ return ST.hasGFX1250Insts() &&
+ legalizePreloadedArgIntrin(
+ MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y);
+ case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
+ return ST.hasGFX1250Insts() &&
+ legalizePreloadedArgIntrin(
+ MI, MRI, B, AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z);
+ case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
+ return ST.hasGFX1250Insts() &&
+ legalizePreloadedArgIntrin(
+ MI, MRI, B,
+ AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID);
case Intrinsic::amdgcn_wave_id:
return legalizeWaveID(MI, B);
case Intrinsic::amdgcn_lds_kernel_id:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 1f4e02b0d600a..cd44a9ba0807c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -114,6 +114,11 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B,
const ArgDescriptor *Arg,
const TargetRegisterClass *ArgRC, LLT ArgTy) const;
+ bool legalizeWorkGroupId(
+ MachineInstr &MI, MachineIRBuilder &B,
+ AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV,
+ AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
+ AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const;
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
@@ -218,6 +223,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B,
+ AMDGPU::Hwreg::Id HwReg, unsigned LowBit,
+ unsigned Width) const;
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2a977247bc2cb..8e255849072c8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2400,6 +2400,53 @@ SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
return ArgValue;
}
+SDValue SITargetLowering::lowerWorkGroupId(
+ SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
+ AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
+ AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
+ AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
+ if (!Subtarget->hasClusters())
+ return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
+
+ // Clusters are supported. Return the global position in the grid. If clusters
+ // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
+
+ // WorkGroupIdXYZ = ClusterId == 0 ?
+ // ClusterIdXYZ :
+ // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
+ SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
+ SDLoc SL(ClusterIdXYZ);
+ SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
+ SDValue One = DAG.getConstant(1, SL, VT);
+ SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
+ SDValue ClusterWorkGroupIdXYZ =
+ getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
+ SDValue GlobalIdXYZ =
+ DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
+ DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
+
+ switch (MFI.getClusterDims().getKind()) {
+ case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
+ case AMDGPU::ClusterDimsAttr::Kind::VariableDims:
+ return GlobalIdXYZ;
+ case AMDGPU::ClusterDimsAttr::Kind::NoCluster:
+ return ClusterIdXYZ;
+ case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
+ using namespace AMDGPU::Hwreg;
+ SDValue ClusterIdField =
+ DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
+ SDNode *GetReg =
+ DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
+ SDValue ClusterId(GetReg, 0);
+ SDValue Zero = DAG.getConstant(0, SL, VT);
+ return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
+ GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
+ }
+ }
+
+ llvm_unreachable("nothing should reach here");
+}
+
SDValue SITargetLowering::getPreloadedValue(
SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
@@ -2418,9 +2465,30 @@ SDValue SITargetLowering::getPreloadedValue(
AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
const ArgDescriptor WorkGroupIDZ =
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
+ const ArgDescriptor ClusterWorkGroupIDX =
+ ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
+ const ArgDescriptor ClusterWorkGroupIDY =
+ ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
+ const ArgDescriptor ClusterWorkGroupIDZ =
+ ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
+ const ArgDescriptor ClusterWorkGroupMaxIDX =
+ ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
+ const...
[truncated]
``````````
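For readers tracing the ArgDescriptor masks above: the patch carves TTMP6 into seven 4-bit fields, from cluster workgroup ID X in bits 3:0 through the max flat ID in bits 27:24. A standalone decoding sketch (hypothetical helper, for illustration only):

```c++
#include <cstdint>

// Extract one of the 4-bit TTMP6 fields implied by the masks in the patch,
// e.g. CLUSTER_WORKGROUP_ID_Y (mask 0x000000F0) is the field at LowBit = 4.
uint32_t extractTTMP6Field(uint32_t TTMP6, unsigned LowBit) {
  return (TTMP6 >> LowBit) & 0xF;
}
```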
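The S_GETREG_B32_const reads build their immediate with HwregEncoding::encode. Assuming the usual hwreg simm16 packing (register id in bits 5:0, bit offset in bits 10:6, width minus one in bits 15:11), the encoding can be modeled as:

```c++
// Assumed simm16 layout for s_getreg_b32; treat the field positions as an
// assumption rather than a statement about this patch.
unsigned encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
  return Id | (Offset << 6) | ((Width - 1) << 11);
}
// e.g. the cluster-ID check above reads ID_IB_STS2 at offset 6, width 4.
```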
https://github.com/llvm/llvm-project/pull/157978
More information about the llvm-commits mailing list