[clang] clang/AMDGPU: Set amdgpu-max-num-workgroups to disable Y/Z by default (PR #119009)

Fri Dec 6 11:11:08 PST 2024

https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/119009

Only OpenCL supports 2d and 3d dispatches, the other languages Y and Z
dimensions are always 1. Some of the generated OpenMP functions don't
seem to get the correct attributes. The kernels do, but the callable
__omp_offloading functions are missing them for some reason.

>From 7c9c6c6897c74df7427c20b0e1305efaa2028c8c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 6 Dec 2024 13:21:41 -0500
Subject: [PATCH] clang/AMDGPU: Set amdgpu-max-num-workgroups to disable Y/Z by
 default

Only OpenCL supports 2d and 3d dispatches, the other languages Y and Z
dimensions are always 1. Some of the generated OpenMP functions don't
seem to get the correct attributes. The kernels do, but the callable
__omp_offloading functions are missing them for some reason.
---
 clang/include/clang/Basic/LangOptions.h       |  4 ++
 clang/lib/CodeGen/Targets/AMDGPU.cpp          | 56 +++++++++++--------
 clang/test/CodeGenHIP/default-attributes.hip  |  4 +-
 clang/test/OpenMP/amdgcn-attributes.cpp       | 12 ++--
 .../amdgcn_target_global_constructor.cpp      |  2 +-
 5 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 949c8f5d448bcf..d5532eec0a683e 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -706,6 +706,10 @@ class LangOptions : public LangOptionsBase {
     return OpenCL || CUDA;
   }
 
+  /// Return true if the dispatch size for an offload language only uses one
+  /// dimension.
+  bool gridSizeIsOneDimension() const { return CUDA || HIP || OpenMP; }
+
   /// Return the OpenCL C or C++ version as a VersionTuple.
   VersionTuple getOpenCLVersionTuple() const;
 
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 56ad0503a11ab2..904e03b3cc7182 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -377,29 +377,6 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     if (NumVGPR != 0)
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
-
-  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
-    uint32_t X = Attr->getMaxNumWorkGroupsX()
-                     ->EvaluateKnownConstInt(M.getContext())
-                     .getExtValue();
-    // Y and Z dimensions default to 1 if not specified
-    uint32_t Y = Attr->getMaxNumWorkGroupsY()
-                     ? Attr->getMaxNumWorkGroupsY()
-                           ->EvaluateKnownConstInt(M.getContext())
-                           .getExtValue()
-                     : 1;
-    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
-                     ? Attr->getMaxNumWorkGroupsZ()
-                           ->EvaluateKnownConstInt(M.getContext())
-                           .getExtValue()
-                     : 1;
-
-    llvm::SmallString<32> AttrVal;
-    llvm::raw_svector_ostream OS(AttrVal);
-    OS << X << ',' << Y << ',' << Z;
-
-    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
-  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the
@@ -450,9 +427,40 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
   if (!F)
     return;
 
+  // TODO: Use AttrBuilder
   const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
-  if (FD)
+  const AMDGPUMaxNumWorkGroupsAttr *MaxNumWorkGroupsAttr = nullptr;
+  if (FD) {
     setFunctionDeclAttributes(FD, F, M);
+    MaxNumWorkGroupsAttr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>();
+  }
+
+  if (MaxNumWorkGroupsAttr) {
+    const auto *Attr = MaxNumWorkGroupsAttr;
+    uint32_t X = Attr->getMaxNumWorkGroupsX()
+                     ->EvaluateKnownConstInt(M.getContext())
+                     .getExtValue();
+    // Y and Z dimensions default to 1 if not specified
+    uint32_t Y = Attr->getMaxNumWorkGroupsY()
+                     ? Attr->getMaxNumWorkGroupsY()
+                           ->EvaluateKnownConstInt(M.getContext())
+                           .getExtValue()
+                     : 1;
+    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
+                     ? Attr->getMaxNumWorkGroupsZ()
+                           ->EvaluateKnownConstInt(M.getContext())
+                           .getExtValue()
+                     : 1;
+
+    llvm::SmallString<32> AttrVal;
+    llvm::raw_svector_ostream OS(AttrVal);
+    OS << X << ',' << Y << ',' << Z;
+
+    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
+  } else if (M.getLangOpts().gridSizeIsOneDimension()) {
+    // If the language only has 1D dispatches, disable Y/Z by default.
+    F->addFnAttr("amdgpu-max-num-workgroups", "4294967295,1,1");
+  }
 
   if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
     F->addFnAttr("amdgpu-ieee", "false");
diff --git a/clang/test/CodeGenHIP/default-attributes.hip b/clang/test/CodeGenHIP/default-attributes.hip
index 1b53ebec9b5821..1a2cc42828c2f6 100644
--- a/clang/test/CodeGenHIP/default-attributes.hip
+++ b/clang/test/CodeGenHIP/default-attributes.hip
@@ -34,9 +34,9 @@ __global__ void kernel() {
  extern_func();
 }
 //.
-// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 // OPTNONE: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
 // OPTNONE: attributes #[[ATTR3]] = { convergent nounwind }
 //.
 // OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/clang/test/OpenMP/amdgcn-attributes.cpp b/clang/test/OpenMP/amdgcn-attributes.cpp
index 2c9e16a4f5098e..270cc225d05da2 100644
--- a/clang/test/OpenMP/amdgcn-attributes.cpp
+++ b/clang/test/OpenMP/amdgcn-attributes.cpp
@@ -31,10 +31,10 @@ int callable(int x) {
   return x + 1;
 }
 
-// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
-// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
-// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// DEFAULT: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// CPU: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="true" }
+// NOIEEE: attributes #0 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,42" "amdgpu-ieee"="false" "amdgpu-max-num-workgroups"="4294967295,1,1" "kernel" "no-nans-fp-math"="true" "no-trapping-math"="true" "omp_target_thread_limit"="42" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
 
-// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
-// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// DEFAULT: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CPU: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" }
+// NOIEEE: attributes #2 = { convergent mustprogress noinline nounwind optnone "amdgpu-ieee"="false" "amdgpu-max-num-workgroups"="4294967295,1,1" "no-nans-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
diff --git a/clang/test/OpenMP/amdgcn_target_global_constructor.cpp b/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
index 9f1e68d4ea0fec..ffde39479761c4 100644
--- a/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
+++ b/clang/test/OpenMP/amdgcn_target_global_constructor.cpp
@@ -98,7 +98,7 @@ S A;
 //
 //.
 // CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline nounwind optnone "amdgpu-max-num-workgroups"="4294967295,1,1" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 // CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 // CHECK: attributes #[[ATTR3]] = { convergent }
 // CHECK: attributes #[[ATTR4]] = { convergent nounwind }