[clang] clang/AMDGPU: Set amdgpu-max-num-workgroups to disable Y/Z by default (PR #119009)
Matt Arsenault via cfe-commits
cfe-commits at lists.llvm.org
Fri Dec 6 13:34:33 PST 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/119009
>From 7c9c6c6897c74df7427c20b0e1305efaa2028c8c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 6 Dec 2024 13:21:41 -0500
Subject: [PATCH 1/2] clang/AMDGPU: Set amdgpu-max-num-workgroups to disable
Y/Z by default
Only OpenCL supports 2d and 3d dispatches, the other languages Y and Z
dimensions are always 1. Some of the generated OpenMP functions don't
seem to get the correct attributes. The kernels do, but the callable
__omp_offloading functions are missing them for some reason.
---
clang/include/clang/Basic/LangOptions.h | 4 ++
clang/lib/CodeGen/Targets/AMDGPU.cpp | 56 +++++++++++--------
clang/test/CodeGenHIP/default-attributes.hip | 4 +-
clang/test/OpenMP/amdgcn-attributes.cpp | 12 ++--
.../amdgcn_target_global_constructor.cpp | 2 +-
5 files changed, 45 insertions(+), 33 deletions(-)
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 949c8f5d448bcf..d5532eec0a683e 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -706,6 +706,10 @@ class LangOptions : public LangOptionsBase {
return OpenCL || CUDA;
}
+ /// Return true if the dispatch size for an offload language only uses one
+ /// dimension.
+ bool gridSizeIsOneDimension() const { return CUDA || HIP || OpenMP; }
+
/// Return the OpenCL C or C++ version as a VersionTuple.
VersionTuple getOpenCLVersionTuple() const;
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 56ad0503a11ab2..904e03b3cc7182 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -377,29 +377,6 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
if (NumVGPR != 0)
F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
}
-
- if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
- uint32_t X = Attr->getMaxNumWorkGroupsX()
- ->EvaluateKnownConstInt(M.getContext())
- .getExtValue();
- // Y and Z dimensions default to 1 if not specified
- uint32_t Y = Attr->getMaxNumWorkGroupsY()
- ? Attr->getMaxNumWorkGroupsY()
- ->EvaluateKnownConstInt(M.getContext())
- .getExtValue()
- : 1;
- uint32_t Z = Attr->getMaxNumWorkGroupsZ()
- ? Attr->getMaxNumWorkGroupsZ()
- ->EvaluateKnownConstInt(M.getContext())
- .getExtValue()
- : 1;
-
- llvm::SmallString<32> AttrVal;
- llvm::raw_svector_ostream OS(AttrVal);
- OS << X << ',' << Y << ',' << Z;
-
- F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
- }
}
/// Emits control constants used to change per-architecture behaviour in the
@@ -450,9 +427,40 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
if (!F)
return;
+ // TODO: Use AttrBuilder
const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
- if (FD)
+ const AMDGPUMaxNumWorkGroupsAttr *MaxNumWorkGroupsAttr = nullptr;
+ if (FD) {
setFunctionDeclAttributes(FD, F, M);
+ MaxNumWorkGroupsAttr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>();
+ }
+
+ if (MaxNumWorkGroupsAttr) {
+ const auto *Attr = MaxNumWorkGroupsAttr;
+ uint32_t X = Attr->getMaxNumWorkGroupsX()
+ ->EvaluateKnownConstInt(M.getContext())
+ .getExtValue();
+ // Y and Z dimensions default to 1 if not specified
+ uint32_t Y = Attr->getMaxNumWorkGroupsY()
+ ? Attr->getMaxNumWorkGroupsY()
+ ->EvaluateKnownConstInt(M.getContext())
+ .getExtValue()
+ : 1;
+ uint32_t Z = Attr->getMaxNumWorkGroupsZ()
+ ? Attr->getMaxNumWorkGroupsZ()
+ ->EvaluateKnownConstInt(M.getContext())
+ .getExtValue()
+ : 1;
+
+ llvm::SmallString<32> AttrVal;
+ llvm::raw_svector_ostream OS(AttrVal);
+ OS << X << ',' << Y << ',' << Z;
+
+ F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
+ } else if (M.getLangOpts().gridSizeIsOneDimension()) {
+ // If the language only has 1D dispatches, disable Y/Z by default.
+ F->addFnAttr("amdgpu-max-num-workgroups", "4294967295,1,1");
+ }
if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
F->addFnAttr("amdgpu-ieee", "false");
diff --git a/clang/test/CodeGenHIP/default-attributes.hip b/clang/test/CodeGenHIP/default-attributes.hip
index 1b53ebec9b5821..1a2cc42828c2f6 100644
--- a/clang/test/CodeGenHIP/default-attributes.hip
+++ b/clang/test/CodeGenHIP/default-attributes.hip
@@ -34,9 +34,9 @@ __global__ void kernel() {
extern_func();
}
//.
-// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// OPTNONE: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
// OPTNONE: attributes #[[ATTR3]] = { convergent nounwind }
//.
// OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/clang/test/OpenMP/amdgcn-attributes.cpp b/clang/test/OpenMP/amdgcn-attributes.cpp
index 2c9e16a4f5098e..270cc225d05da2 100644
--- a/clang/test/OpenMP/amdgcn-attributes.cpp
+++ b/clang/test/OpenMP/amdgcn-attributes.cpp
@@ -31,10 +31,10 @@ int callable(int x) {
return x + 1;
}
-// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
-// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
-// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
+// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
-// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
-// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "amdgpu-max-num-workgroups"="4294967295,1,1" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
diff --git a/clang/test/OpenMP/amdgcn_target_global_constructor.cpp b/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
index 9f1e68d4ea0fec..ffde39479761c4 100644
--- a/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
+++ b/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
@@ -98,7 +98,7 @@ S A;
//
//.
// CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// CHECK: attributes #[[ATTR3]] = { convergent }
// CHECK: attributes #[[ATTR4]] = { convergent nounwind }
>From d57f0603daf511141b00bd6f675f851324ad647c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 6 Dec 2024 16:20:10 -0500
Subject: [PATCH 2/2] Don't do for openmp
---
clang/include/clang/Basic/LangOptions.h | 2 +-
clang/test/OpenMP/amdgcn-attributes.cpp | 12 ++++++------
.../test/OpenMP/amdgcn_target_global_constructor.cpp | 2 +-
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index d5532eec0a683e..9a60480d6bfe11 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -708,7 +708,7 @@ class LangOptions : public LangOptionsBase {
/// Return true if the dispatch size for an offload language only uses one
/// dimension.
- bool gridSizeIsOneDimension() const { return CUDA || HIP || OpenMP; }
+ bool gridSizeIsOneDimension() const { return CUDA || HIP; }
/// Return the OpenCL C or C++ version as a VersionTuple.
VersionTuple getOpenCLVersionTuple() const;
diff --git a/clang/test/OpenMP/amdgcn-attributes.cpp b/clang/test/OpenMP/amdgcn-attributes.cpp
index 270cc225d05da2..2c9e16a4f5098e 100644
--- a/clang/test/OpenMP/amdgcn-attributes.cpp
+++ b/clang/test/OpenMP/amdgcn-attributes.cpp
@@ -31,10 +31,10 @@ int callable(int x) {
return x + 1;
}
-// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
-// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
-// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
+// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
-// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
-// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "amdgpu-max-num-workgroups"="4294967295,1,1" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
diff --git a/clang/test/OpenMP/amdgcn_target_global_constructor.cpp b/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
index ffde39479761c4..9f1e68d4ea0fec 100644
--- a/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
+++ b/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
@@ -98,7 +98,7 @@ S A;
//
//.
// CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// CHECK: attributes #[[ATTR3]] = { convergent }
// CHECK: attributes #[[ATTR4]] = { convergent nounwind }
More information about the cfe-commits
mailing list