[clang] [AMDGPU] Synthetic return coerce for aggregates with empty-for-layout members. (PR #197004)
Abid Qadeer via cfe-commits
cfe-commits at lists.llvm.org
Wed Jun 10 08:16:11 PDT 2026
https://github.com/abidh updated https://github.com/llvm/llvm-project/pull/197004
>From 6ecc9ee71ee8b55a4c71c426e490b9b6d6dfa245 Mon Sep 17 00:00:00 2001
From: Abid Qadeer <haqadeer at amd.com>
Date: Wed, 29 Apr 2026 16:09:40 +0100
Subject: [PATCH 1/4] [AMDGPU] Synthetic return coerce for aggregates with
empty-for-layout members.
After llvm#96422, empty-for-layout members can show up as 4 x i8. This
logically empty type ends up consuming 4 VGPRs and breaks the ABI.
This PR teaches the AMDGPU ABI to use an explicit synthetic coerce
struct when it is returning a struct that can transitively contain an
empty-for-layout member. This coerce struct does not create array for
padding bytes. As a result, the fields go in the registers as ABI
expects. The numRegsForType has been fixed accordingly as well.
---
clang/lib/CodeGen/Targets/AMDGPU.cpp | 111 +++++++++++++++++-
.../amdgpu-aggregate-return-coerce.hip | 32 +++++
2 files changed, 142 insertions(+), 1 deletion(-)
create mode 100644 clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index a3a596bb9d822..06bd6076d4e50 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -9,7 +9,11 @@
#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/AST/DeclCXX.h"
+#include "clang/AST/RecordLayout.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
using namespace clang;
@@ -21,6 +25,94 @@ using namespace clang::CodeGen;
namespace {
+/// True if \p Ty is a record whose fields (or bases) include a field that
+/// is empty for layout, or that contain such a field transitively through
+/// member or base types.
+static bool recordTypeHasEmptyFieldForLayout(ASTContext &Ctx, QualType Ty) {
+ const RecordDecl *RD = Ty->getAsRecordDecl();
+ if (!RD)
+ return false;
+
+ if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+ for (const auto &B : CXXRD->bases()) {
+ if (recordTypeHasEmptyFieldForLayout(Ctx, B.getType()))
+ return true;
+ }
+ }
+
+ for (const FieldDecl *FD : RD->fields()) {
+ if (isEmptyFieldForLayout(Ctx, FD))
+ return true;
+ if (recordTypeHasEmptyFieldForLayout(Ctx, FD->getType()))
+ return true;
+ }
+ return false;
+}
+
+/// Build a LLVM struct for AMDGPU aggregate return coercion: one element per
+/// non-empty base subobject and per field, ordered by \c ASTRecordLayout
+/// offsets (matching in-object layout). Nested records that also need this
+/// coercion use a nested coerce type; otherwise \c ConvertType is used.
+static llvm::Type *buildAMDGPUAggregateReturnCoerceType(CodeGenTypes &CGT,
+ ASTContext &Ctx,
+ QualType Ty) {
+ if (!recordTypeHasEmptyFieldForLayout(Ctx, Ty))
+ return nullptr;
+
+ const RecordDecl *RD = Ty->getAsRecordDecl();
+ if (!RD || !RD->getDefinition() || RD->isUnion())
+ return nullptr;
+ assert(!RD->hasFlexibleArrayMember());
+
+ // Vtable and dynamic-class layout are not represented here; use the normal
+ // LLVM record type as the coerce-to type.
+ if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD))
+ if (CXXRD->isDynamicClass())
+ return nullptr;
+
+ const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD);
+
+ struct CoerceMember {
+ CharUnits Offset;
+ QualType Ty;
+ };
+ llvm::SmallVector<CoerceMember, 16> Members;
+
+ if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+ for (const CXXBaseSpecifier &B : CXXRD->bases()) {
+ const CXXRecordDecl *BaseDecl = B.getType()->getAsCXXRecordDecl();
+ if (!BaseDecl || BaseDecl->isEmpty())
+ continue;
+ BaseDecl = BaseDecl->getDefinition();
+ CharUnits Off = B.isVirtual() ? Layout.getVBaseClassOffset(BaseDecl)
+ : Layout.getBaseClassOffset(BaseDecl);
+ Members.push_back({Off, B.getType()});
+ }
+ }
+
+ for (const FieldDecl *FD : RD->fields()) {
+ CharUnits Off =
+ Ctx.toCharUnitsFromBits(Layout.getFieldOffset(FD->getFieldIndex()));
+ Members.push_back({Off, FD->getType()});
+ }
+
+ llvm::stable_sort(Members, [](const CoerceMember &A, const CoerceMember &B) {
+ return A.Offset < B.Offset;
+ });
+
+ llvm::LLVMContext &VM = CGT.getLLVMContext();
+ llvm::SmallVector<llvm::Type *, 16> Elts;
+ for (const CoerceMember &M : Members) {
+ if (llvm::Type *Nested =
+ buildAMDGPUAggregateReturnCoerceType(CGT, Ctx, M.Ty))
+ Elts.push_back(Nested);
+ else
+ Elts.push_back(CGT.ConvertType(M.Ty));
+ }
+
+ return llvm::StructType::create(VM, Elts);
+}
+
class AMDGPUABIInfo final : public DefaultABIInfo {
private:
static const unsigned MaxNumRegsForArgsRet = 16;
@@ -99,7 +191,20 @@ uint64_t AMDGPUABIInfo::numRegsForType(QualType Ty) const {
if (const auto *RD = Ty->getAsRecordDecl()) {
assert(!RD->hasFlexibleArrayMember());
+ if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+ for (const CXXBaseSpecifier &B : CXXRD->bases()) {
+ const CXXRecordDecl *BD = B.getType()->getAsCXXRecordDecl();
+ if (!BD || BD->isEmpty())
+ continue;
+ NumRegs += numRegsForType(B.getType());
+ }
+ }
+
for (const FieldDecl *Field : RD->fields()) {
+ if (isEmptyFieldForLayout(getContext(), Field)) {
+ NumRegs += 1;
+ continue;
+ }
QualType FieldTy = Field->getType();
NumRegs += numRegsForType(FieldTy);
}
@@ -169,8 +274,12 @@ ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
}
- if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
+ if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet) {
+ if (llvm::Type *CoerceTy =
+ buildAMDGPUAggregateReturnCoerceType(CGT, getContext(), RetTy))
+ return ABIArgInfo::getDirect(CoerceTy);
return ABIArgInfo::getDirect();
+ }
}
}
diff --git a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
new file mode 100644
index 0000000000000..61729285fac5f
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm -fcuda-is-device -o - %s \
+// RUN: | FileCheck %s
+
+#define __device__ __attribute__((device))
+
+struct OnlyStatic {
+ __device__ static int something;
+};
+struct WithStaticFields {
+ int a[2];
+ OnlyStatic sub;
+ float b;
+ __device__ static int c;
+ double d;
+};
+__device__ int OnlyStatic::something = 42;
+__device__ int WithStaticFields::c = 12;
+
+__device__ WithStaticFields returnWithStatic() {
+ OnlyStatic::something = 12;
+ WithStaticFields::c = 42;
+ return {.a = {8, 16}, .b = 3.14f, .d = 1.60218e-19};
+}
+
+__device__ void caller() {
+ WithStaticFields r = returnWithStatic();
+ (void)r.b;
+}
+
+// CHECK-DAG: define dso_local %[[TY:.*]] @_Z16returnWithStaticv
+// CHECK-DAG: call %[[TY]] @_Z16returnWithStaticv
+// CHECK-DAG: %[[TY]] = type { [2 x i32], %struct.OnlyStatic, float, double }
>From a0b1c0168bf29000c5a279a5c97aead2c69057af Mon Sep 17 00:00:00 2001
From: Abid Qadeer <haqadeer at amd.com>
Date: Mon, 11 May 2026 18:29:41 +0100
Subject: [PATCH 2/4] Add a base class test.
---
.../amdgpu-aggregate-return-coerce.hip | 34 +++++++++++++++++++
1 file changed, 34 insertions(+)
diff --git a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
index 61729285fac5f..3214279d748b3 100644
--- a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
+++ b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
@@ -27,6 +27,40 @@ __device__ void caller() {
(void)r.b;
}
+// Base class with an empty-for-layout member: return coercion must include the
+// non-empty base subobject as a nested coerce struct (not only direct fields).
+struct BaseWithEmptyMember {
+ int a[2];
+ OnlyStatic sub;
+ float b;
+ __device__ static int c;
+};
+struct DerivedWithBase : BaseWithEmptyMember {
+ double d;
+};
+__device__ int BaseWithEmptyMember::c = 12;
+
+__device__ DerivedWithBase returnDerivedWithBase() {
+ OnlyStatic::something = 12;
+ BaseWithEmptyMember::c = 42;
+ DerivedWithBase r{};
+ r.a[0] = 8;
+ r.a[1] = 16;
+ r.b = 3.14f;
+ r.d = 1.60218e-19;
+ return r;
+}
+
+__device__ void callerDerived() {
+ DerivedWithBase r = returnDerivedWithBase();
+ (void)r.b;
+}
+
// CHECK-DAG: define dso_local %[[TY:.*]] @_Z16returnWithStaticv
// CHECK-DAG: call %[[TY]] @_Z16returnWithStaticv
// CHECK-DAG: %[[TY]] = type { [2 x i32], %struct.OnlyStatic, float, double }
+
+// CHECK-DAG: define dso_local %[[DER:.*]] @_Z21returnDerivedWithBasev
+// CHECK-DAG: call %[[DER]] @_Z21returnDerivedWithBasev
+// CHECK-DAG: %[[BASE:[0-9]+]] = type { [2 x i32], %struct.OnlyStatic, float }
+// CHECK-DAG: %[[DER]] = type { %[[BASE]], double }
>From a72915a0a2c1fd9ce96827b3593f57cb3bf87b69 Mon Sep 17 00:00:00 2001
From: Abid Qadeer <haqadeer at amd.com>
Date: Thu, 14 May 2026 16:51:35 +0100
Subject: [PATCH 3/4] Use sort instead of stable_sort.
---
clang/lib/CodeGen/Targets/AMDGPU.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 06bd6076d4e50..1ace0502aaec6 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -96,7 +96,7 @@ static llvm::Type *buildAMDGPUAggregateReturnCoerceType(CodeGenTypes &CGT,
Members.push_back({Off, FD->getType()});
}
- llvm::stable_sort(Members, [](const CoerceMember &A, const CoerceMember &B) {
+ llvm::sort(Members, [](const CoerceMember &A, const CoerceMember &B) {
return A.Offset < B.Offset;
});
>From 35e7295ecaf013eca80d62d0ad1403ee06541e0a Mon Sep 17 00:00:00 2001
From: Abid Qadeer <haqadeer at amd.com>
Date: Wed, 10 Jun 2026 15:44:38 +0100
Subject: [PATCH 4/4] Handle review comments.
Add more tests. Also refactor some code so that we can assert on an
unreachable code path.
---
clang/lib/CodeGen/Targets/AMDGPU.cpp | 16 +++--
.../amdgpu-aggregate-return-coerce.hip | 58 ++++++++++++++++---
2 files changed, 57 insertions(+), 17 deletions(-)
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 1ace0502aaec6..2399103f795fe 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -64,12 +64,6 @@ static llvm::Type *buildAMDGPUAggregateReturnCoerceType(CodeGenTypes &CGT,
return nullptr;
assert(!RD->hasFlexibleArrayMember());
- // Vtable and dynamic-class layout are not represented here; use the normal
- // LLVM record type as the coerce-to type.
- if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD))
- if (CXXRD->isDynamicClass())
- return nullptr;
-
const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD);
struct CoerceMember {
@@ -79,14 +73,18 @@ static llvm::Type *buildAMDGPUAggregateReturnCoerceType(CodeGenTypes &CGT,
llvm::SmallVector<CoerceMember, 16> Members;
if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+ // Vtable and dynamic-class layout are not represented here; use the normal
+ // LLVM record type as the coerce-to type.
+ if (CXXRD->isDynamicClass())
+ return nullptr;
for (const CXXBaseSpecifier &B : CXXRD->bases()) {
const CXXRecordDecl *BaseDecl = B.getType()->getAsCXXRecordDecl();
if (!BaseDecl || BaseDecl->isEmpty())
continue;
BaseDecl = BaseDecl->getDefinition();
- CharUnits Off = B.isVirtual() ? Layout.getVBaseClassOffset(BaseDecl)
- : Layout.getBaseClassOffset(BaseDecl);
- Members.push_back({Off, B.getType()});
+ // isDynamicClass() above guards against any class that has virtual bases
+ assert(!B.isVirtual() && "virtual base implies isDynamicClass");
+ Members.push_back({Layout.getBaseClassOffset(BaseDecl), B.getType()});
}
}
diff --git a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
index 3214279d748b3..29b38b0e8cb2e 100644
--- a/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
+++ b/clang/test/CodeGenHIP/amdgpu-aggregate-return-coerce.hip
@@ -22,11 +22,6 @@ __device__ WithStaticFields returnWithStatic() {
return {.a = {8, 16}, .b = 3.14f, .d = 1.60218e-19};
}
-__device__ void caller() {
- WithStaticFields r = returnWithStatic();
- (void)r.b;
-}
-
// Base class with an empty-for-layout member: return coercion must include the
// non-empty base subobject as a nested coerce struct (not only direct fields).
struct BaseWithEmptyMember {
@@ -51,9 +46,48 @@ __device__ DerivedWithBase returnDerivedWithBase() {
return r;
}
-__device__ void callerDerived() {
- DerivedWithBase r = returnDerivedWithBase();
- (void)r.b;
+// Empty base class
+struct EmptyBase {};
+struct WithEmptyBase : EmptyBase {
+ int a;
+ OnlyStatic sub;
+ float b;
+};
+
+__device__ WithEmptyBase returnWithEmptyBase() {
+ WithEmptyBase r{};
+ r.a = 1;
+ r.b = 2.0f;
+ return r;
+}
+
+// Derived class whose base has no empty-for-layout fields but the derived
+// class itself does. The base does not need its own coerce type, so it
+// appears in the coerce struct as the named LLVM type rather than
+// an anonymous type generated by the recursive coerce path.
+struct PlainBase { int x; float y; };
+struct DerivedWithOwnEmpty : PlainBase {
+ OnlyStatic sub;
+ double z;
+};
+
+__device__ DerivedWithOwnEmpty returnDerivedWithOwnEmpty() {
+ DerivedWithOwnEmpty r{};
+ r.x = 1;
+ r.y = 2.0f;
+ r.z = 3.0;
+ return r;
+}
+
+__device__ void caller() {
+ WithStaticFields r1 = returnWithStatic();
+ (void)r1.b;
+ DerivedWithBase r2 = returnDerivedWithBase();
+ (void)r2.b;
+ WithEmptyBase r3 = returnWithEmptyBase();
+ (void)r3.a;
+ DerivedWithOwnEmpty r4 = returnDerivedWithOwnEmpty();
+ (void)r4.z;
}
// CHECK-DAG: define dso_local %[[TY:.*]] @_Z16returnWithStaticv
@@ -64,3 +98,11 @@ __device__ void callerDerived() {
// CHECK-DAG: call %[[DER]] @_Z21returnDerivedWithBasev
// CHECK-DAG: %[[BASE:[0-9]+]] = type { [2 x i32], %struct.OnlyStatic, float }
// CHECK-DAG: %[[DER]] = type { %[[BASE]], double }
+
+// CHECK-DAG: define dso_local %[[EB:.*]] @_Z19returnWithEmptyBasev
+// CHECK-DAG: call %[[EB]] @_Z19returnWithEmptyBasev
+// CHECK-DAG: %[[EB]] = type { i32, %struct.OnlyStatic, float }
+
+// CHECK-DAG: define dso_local %[[PLAIN:.*]] @_Z25returnDerivedWithOwnEmptyv
+// CHECK-DAG: call %[[PLAIN]] @_Z25returnDerivedWithOwnEmptyv
+// CHECK-DAG: %[[PLAIN]] = type { %struct.PlainBase, %struct.OnlyStatic, double }
More information about the cfe-commits
mailing list