[clang] 8920ef0 - [hip] Remove the coercion on aggregate kernel arguments.
Michael Liao via cfe-commits
cfe-commits at lists.llvm.org
Thu Nov 12 18:19:41 PST 2020
Author: Michael Liao
Date: 2020-11-12T21:19:30-05:00
New Revision: 8920ef06a138c46b208fb6471d500261c4b9bacc
URL: https://github.com/llvm/llvm-project/commit/8920ef06a138c46b208fb6471d500261c4b9bacc
DIFF: https://github.com/llvm/llvm-project/commit/8920ef06a138c46b208fb6471d500261c4b9bacc.diff
LOG: [hip] Remove the coercion on aggregate kernel arguments.
- If an aggregate argument is indirectly accessed within kernels, direct
passing results in unpromotable `alloca`, which degrade performance
significantly. InferAddrSpace pass is enhanced in
[D91121](https://reviews.llvm.org/D91121) to take the assumption that
generic pointers loaded from the constant memory could be regarded
global ones. The need for the coercion on aggregate arguments is
mitigated.
Differential Revision: https://reviews.llvm.org/D89980
Added:
Modified:
clang/lib/CodeGen/TargetInfo.cpp
clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
clang/test/CodeGenCUDA/kernel-args.cu
Removed:
################################################################################
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
index 63502ccf7a38..1e5920322ecd 100644
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -8712,35 +8712,9 @@ class AMDGPUABIInfo final : public DefaultABIInfo {
bool isHomogeneousAggregateSmallEnough(const Type *Base,
uint64_t Members) const override;
- // Coerce HIP pointer arguments from generic pointers to global ones.
+ // Coerce HIP scalar pointer arguments from generic pointers to global ones.
llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
unsigned ToAS) const {
- // Structure types.
- if (auto STy = dyn_cast<llvm::StructType>(Ty)) {
- SmallVector<llvm::Type *, 8> EltTys;
- bool Changed = false;
- for (auto T : STy->elements()) {
- auto NT = coerceKernelArgumentType(T, FromAS, ToAS);
- EltTys.push_back(NT);
- Changed |= (NT != T);
- }
- // Skip if there is no change in element types.
- if (!Changed)
- return STy;
- if (STy->hasName())
- return llvm::StructType::create(
- EltTys, (STy->getName() + ".coerce").str(), STy->isPacked());
- return llvm::StructType::get(getVMContext(), EltTys, STy->isPacked());
- }
- // Array types.
- if (auto ATy = dyn_cast<llvm::ArrayType>(Ty)) {
- auto T = ATy->getElementType();
- auto NT = coerceKernelArgumentType(T, FromAS, ToAS);
- // Skip if there is no change in that element type.
- if (NT == T)
- return ATy;
- return llvm::ArrayType::get(NT, ATy->getNumElements());
- }
// Single value types.
if (Ty->isPointerTy() && Ty->getPointerAddressSpace() == FromAS)
return llvm::PointerType::get(
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
index 2660a5f14f90..dc4659856026 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -9,8 +9,6 @@
// Coerced struct from `struct S` without all generic pointers lowered into
// global ones.
-// COMMON: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
-// COMMON: %struct.T.coerce = type { [2 x float addrspace(1)*] }
// On the host-side compilation, generic pointer won't be coerced.
// HOST-NOT: %struct.S.coerce
@@ -61,15 +59,17 @@ struct S {
// `by-val` struct will be coerced into a similar struct with all generic
// pointers lowerd into global ones.
// HOST: define void @_Z22__device_stub__kernel41S(i32* %s.coerce0, float* %s.coerce1)
-// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
-// OPT: [[P0:%.*]] = extractvalue %struct.S.coerce %s.coerce, 0
-// OPT: [[P1:%.*]] = extractvalue %struct.S.coerce %s.coerce, 1
-// OPT: [[V0:%.*]] = load i32, i32 addrspace(1)* [[P0]], align 4
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel41S(%struct.S addrspace(4)*{{.*}} byref(%struct.S) align 8 %0)
+// OPT: [[R0:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 0
+// OPT: [[P0:%.*]] = load i32*, i32* addrspace(4)* [[R0]], align 8
+// OPT: [[R1:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 1
+// OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
+// OPT: [[V0:%.*]] = load i32, i32* [[P0]], align 4
// OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1
-// OPT: store i32 [[INC]], i32 addrspace(1)* [[P0]], align 4
-// OPT: [[V1:%.*]] = load float, float addrspace(1)* [[P1]], align 4
+// OPT: store i32 [[INC]], i32* [[P0]], align 4
+// OPT: [[V1:%.*]] = load float, float* [[P1]], align 4
// OPT: [[ADD:%.*]] = fadd contract float [[V1]], 1.000000e+00
-// OPT: store float [[ADD]], float addrspace(1)* [[P1]], align 4
+// OPT: store float [[ADD]], float* [[P1]], align 4
// OPT: ret void
__global__ void kernel4(struct S s) {
s.x[0]++;
@@ -89,16 +89,17 @@ struct T {
};
// `by-val` array is also coerced.
// HOST: define void @_Z22__device_stub__kernel61T(float* %t.coerce0, float* %t.coerce1)
-// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
-// OPT: [[ARR:%.*]] = extractvalue %struct.T.coerce %t.coerce, 0
-// OPT: [[P0:%.*]] = extractvalue [2 x float addrspace(1)*] [[ARR]], 0
-// OPT: [[P1:%.*]] = extractvalue [2 x float addrspace(1)*] [[ARR]], 1
-// OPT: [[V0:%.*]] = load float, float addrspace(1)* [[P0]], align 4
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel61T(%struct.T addrspace(4)*{{.*}} byref(%struct.T) align 8 %0)
+// OPT: [[R0:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 0
+// OPT: [[P0:%.*]] = load float*, float* addrspace(4)* [[R0]], align 8
+// OPT: [[R1:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 1
+// OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
+// OPT: [[V0:%.*]] = load float, float* [[P0]], align 4
// OPT: [[ADD0:%.*]] = fadd contract float [[V0]], 1.000000e+00
-// OPT: store float [[ADD0]], float addrspace(1)* [[P0]], align 4
-// OPT: [[V1:%.*]] = load float, float addrspace(1)* [[P1]], align 4
+// OPT: store float [[ADD0]], float* [[P0]], align 4
+// OPT: [[V1:%.*]] = load float, float* [[P1]], align 4
// OPT: [[ADD1:%.*]] = fadd contract float [[V1]], 2.000000e+00
-// OPT: store float [[ADD1]], float addrspace(1)* [[P1]], align 4
+// OPT: store float [[ADD1]], float* [[P1]], align 4
// OPT: ret void
__global__ void kernel6(struct T t) {
t.x[0][0] += 1.f;
@@ -111,3 +112,19 @@ __global__ void kernel6(struct T t) {
__global__ void kernel7(int *__restrict x) {
x[0]++;
}
+
+// Single element struct.
+struct SS {
+ float *x;
+};
+// HOST: define void @_Z22__device_stub__kernel82SS(float* %a.coerce)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel82SS(float addrspace(1)*{{.*}} %a.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT: [[VAL:%.*]] = load float, float addrspace(1)* %a.coerce, align 4
+// OPT: [[INC:%.*]] = fadd contract float [[VAL]], 3.000000e+00
+// OPT: store float [[INC]], float addrspace(1)* %a.coerce, align 4
+// OPT: ret void
+__global__ void kernel8(struct SS a) {
+ *a.x += 3.f;
+}
diff --git a/clang/test/CodeGenCUDA/kernel-args.cu b/clang/test/CodeGenCUDA/kernel-args.cu
index 1bc613c699f2..14f4a17e3993 100644
--- a/clang/test/CodeGenCUDA/kernel-args.cu
+++ b/clang/test/CodeGenCUDA/kernel-args.cu
@@ -1,22 +1,23 @@
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -fcuda-is-device \
// RUN: -emit-llvm %s -o - | FileCheck -check-prefix=AMDGCN %s
-// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda- -fcuda-is-device \
+// RUN: %clang_cc1 -x cuda -triple nvptx64-nvidia-cuda- -fcuda-is-device \
// RUN: -emit-llvm %s -o - | FileCheck -check-prefix=NVPTX %s
#include "Inputs/cuda.h"
struct A {
int a[32];
+ float *p;
};
-// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}})
-// NVPTX: define void @_Z6kernel1A(%struct.A* byval(%struct.A) align 4 %x)
+// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}})
+// NVPTX: define void @_Z6kernel1A(%struct.A* byval(%struct.A) align 8 %x)
__global__ void kernel(A x) {
}
class Kernel {
public:
- // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}})
- // NVPTX: define void @_ZN6Kernel12memberKernelE1A(%struct.A* byval(%struct.A) align 4 %x)
+ // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}})
+ // NVPTX: define void @_ZN6Kernel12memberKernelE1A(%struct.A* byval(%struct.A) align 8 %x)
static __global__ void memberKernel(A x){}
template<typename T> static __global__ void templateMemberKernel(T x) {}
};
@@ -29,11 +30,11 @@ void launch(void*);
void test() {
Kernel K;
- // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}}
- // NVPTX: define void @_Z14templateKernelI1AEvT_(%struct.A* byval(%struct.A) align 4 %x)
+ // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}}
+ // NVPTX: define void @_Z14templateKernelI1AEvT_(%struct.A* byval(%struct.A) align 8 %x)
launch((void*)templateKernel<A>);
- // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}}
- // NVPTX: define void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A* byval(%struct.A) align 4 %x)
+ // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}}
+ // NVPTX: define void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A* byval(%struct.A) align 8 %x)
launch((void*)Kernel::templateMemberKernel<A>);
}
More information about the cfe-commits
mailing list