[llvm-branch-commits] [clang] cd95338 - [CUDA][HIP] Fix capturing reference to host variable
Yaxun Liu via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Dec 2 07:20:07 PST 2020
Author: Yaxun (Sam) Liu
Date: 2020-12-02T10:14:46-05:00
New Revision: cd95338ee3022bffd658e52cd3eb9419b4c218ca
URL: https://github.com/llvm/llvm-project/commit/cd95338ee3022bffd658e52cd3eb9419b4c218ca
DIFF: https://github.com/llvm/llvm-project/commit/cd95338ee3022bffd658e52cd3eb9419b4c218ca.diff
LOG: [CUDA][HIP] Fix capturing reference to host variable
In C++ when a reference variable is captured by copy, the lambda
is supposed to make a copy of the referenced variable in the captures
and refer to the copy in the lambda. Therefore, it is valid to capture
a reference to a host global variable in a device lambda since the
device lambda will refer to the copy of the host global variable instead
of access the host global variable directly.
However, clang tries to avoid capturing of reference to a host global variable
if it determines the use of the reference variable in the lambda function is
not odr-use. Clang also tries to emit load of the reference to a global variable
as load of the global variable if it determines that the reference variable is
a compile-time constant.
For a device lambda to capture a reference variable to host global variable
and use the captured value, clang needs to be taught that in such cases the use of the reference
variable is odr-use and the reference variable is not compile-time constant.
This patch fixes that.
Differential Revision: https://reviews.llvm.org/D91088
Added:
clang/test/CodeGenCUDA/lambda-reference-var.cu
Modified:
clang/lib/CodeGen/CGExpr.cpp
clang/lib/Sema/SemaExpr.cpp
Removed:
################################################################################
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 325801c83de9..92d0cba7a733 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -1522,6 +1522,29 @@ CodeGenFunction::tryEmitAsConstant(DeclRefExpr *refExpr) {
if (result.HasSideEffects)
return ConstantEmission();
+ // In CUDA/HIP device compilation, a lambda may capture a reference variable
+ // referencing a global host variable by copy. In this case the lambda should
+ // make a copy of the value of the global host variable. The DRE of the
+ // captured reference variable cannot be emitted as load from the host
+ // global variable as compile time constant, since the host variable is not
+ // accessible on device. The DRE of the captured reference variable has to be
+ // loaded from captures.
+ if (CGM.getLangOpts().CUDAIsDevice &&
+ refExpr->refersToEnclosingVariableOrCapture()) {
+ auto *MD = dyn_cast_or_null<CXXMethodDecl>(CurCodeDecl);
+ if (MD && MD->getParent()->isLambda() &&
+ MD->getOverloadedOperator() == OO_Call) {
+ const APValue::LValueBase &base = result.Val.getLValueBase();
+ if (const ValueDecl *D = base.dyn_cast<const ValueDecl *>()) {
+ if (const VarDecl *VD = dyn_cast<const VarDecl>(D)) {
+ if (!VD->hasAttr<CUDADeviceAttr>()) {
+ return ConstantEmission();
+ }
+ }
+ }
+ }
+ }
+
// Emit as a constant.
auto C = ConstantEmitter(*this).emitAbstract(refExpr->getLocation(),
result.Val, resultType);
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 88dab26f2e3b..9c2fc1b9e6dd 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -1934,6 +1934,35 @@ Sema::BuildDeclRefExpr(ValueDecl *D, QualType Ty, ExprValueKind VK,
TemplateArgs);
}
+// CUDA/HIP: Check whether a captured reference variable is referencing a
+// host variable in a device or host device lambda.
+static bool isCapturingReferenceToHostVarInCUDADeviceLambda(const Sema &S,
+ VarDecl *VD) {
+ if (!S.getLangOpts().CUDA || !VD->hasInit())
+ return false;
+ assert(VD->getType()->isReferenceType());
+
+ // Check whether the reference variable is referencing a host variable.
+ auto *DRE = dyn_cast<DeclRefExpr>(VD->getInit());
+ if (!DRE)
+ return false;
+ auto *Referee = dyn_cast<VarDecl>(DRE->getDecl());
+ if (!Referee || !Referee->hasGlobalStorage() ||
+ Referee->hasAttr<CUDADeviceAttr>())
+ return false;
+
+ // Check whether the current function is a device or host device lambda.
+ // Check whether the reference variable is a capture by getDeclContext()
+ // since refersToEnclosingVariableOrCapture() is not ready at this point.
+ auto *MD = dyn_cast_or_null<CXXMethodDecl>(S.CurContext);
+ if (MD && MD->getParent()->isLambda() &&
+ MD->getOverloadedOperator() == OO_Call && MD->hasAttr<CUDADeviceAttr>() &&
+ VD->getDeclContext() != MD)
+ return true;
+
+ return false;
+}
+
NonOdrUseReason Sema::getNonOdrUseReasonInCurrentContext(ValueDecl *D) {
// A declaration named in an unevaluated operand never constitutes an odr-use.
if (isUnevaluatedContext())
@@ -1943,9 +1972,16 @@ NonOdrUseReason Sema::getNonOdrUseReasonInCurrentContext(ValueDecl *D) {
// A variable x whose name appears as a potentially-evaluated expression e
// is odr-used by e unless [...] x is a reference that is usable in
// constant expressions.
+ // CUDA/HIP:
+ // If a reference variable referencing a host variable is captured in a
+ // device or host device lambda, the value of the referee must be copied
+ // to the capture and the reference variable must be treated as odr-use
+ // since the value of the referee is not known at compile time and must
+ // be loaded from the captured.
if (VarDecl *VD = dyn_cast<VarDecl>(D)) {
if (VD->getType()->isReferenceType() &&
!(getLangOpts().OpenMP && isOpenMPCapturedDecl(D)) &&
+ !isCapturingReferenceToHostVarInCUDADeviceLambda(*this, VD) &&
VD->isUsableInConstantExpressions(Context))
return NOUR_Constant;
}
diff --git a/clang/test/CodeGenCUDA/lambda-reference-var.cu b/clang/test/CodeGenCUDA/lambda-reference-var.cu
new file mode 100644
index 000000000000..6d7b343b3193
--- /dev/null
+++ b/clang/test/CodeGenCUDA/lambda-reference-var.cu
@@ -0,0 +1,126 @@
+// RUN: %clang_cc1 -x hip -emit-llvm -std=c++11 %s -o - \
+// RUN: -triple x86_64-linux-gnu \
+// RUN: | FileCheck -check-prefix=HOST %s
+// RUN: %clang_cc1 -x hip -emit-llvm -std=c++11 %s -o - \
+// RUN: -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN: | FileCheck -check-prefix=DEV %s
+
+#include "Inputs/cuda.h"
+
+// HOST: %[[T1:.*]] = type <{ i32*, i32, [4 x i8] }>
+// HOST: %[[T2:.*]] = type { i32*, i32** }
+// HOST: %[[T3:.*]] = type <{ i32*, i32, [4 x i8] }>
+// DEV: %[[T1:.*]] = type { i32* }
+// DEV: %[[T2:.*]] = type { i32** }
+// DEV: %[[T3:.*]] = type <{ i32*, i32, [4 x i8] }>
+int global_host_var;
+__device__ int global_device_var;
+
+template<class F>
+__global__ void kern(F f) { f(); }
+
+// DEV-LABEL: @_ZZ27dev_capture_dev_ref_by_copyPiENKUlvE_clEv(
+// DEV: %[[VAL:.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*)
+// DEV: store i32 %[[VAL]]
+__device__ void dev_capture_dev_ref_by_copy(int *out) {
+ int &ref = global_device_var;
+ [=](){ *out = ref;}();
+}
+
+// DEV-LABEL: @_ZZ26dev_capture_dev_ref_by_refPiENKUlvE_clEv(
+// DEV: %[[VAL:.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*)
+// DEV: %[[VAL2:.*]] = add nsw i32 %[[VAL]], 1
+// DEV: store i32 %[[VAL2]], i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*)
+// DEV: %[[VAL:.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*)
+// DEV: store i32 %[[VAL]]
+__device__ void dev_capture_dev_ref_by_ref(int *out) {
+ int &ref = global_device_var;
+ [&](){ ref++; *out = ref;}();
+}
+
+// DEV-LABEL: define void @_Z7dev_refPi(
+// DEV: %[[VAL:.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*)
+// DEV: %[[VAL2:.*]] = add nsw i32 %[[VAL]], 1
+// DEV: store i32 %[[VAL2]], i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*)
+// DEV: %[[VAL:.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*)
+// DEV: store i32 %[[VAL]]
+__device__ void dev_ref(int *out) {
+ int &ref = global_device_var;
+ ref++;
+ *out = ref;
+}
+
+// DEV-LABEL: @_ZZ14dev_lambda_refPiENKUlvE_clEv(
+// DEV: %[[VAL:.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*)
+// DEV: %[[VAL2:.*]] = add nsw i32 %[[VAL]], 1
+// DEV: store i32 %[[VAL2]], i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*)
+// DEV: %[[VAL:.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*)
+// DEV: store i32 %[[VAL]]
+__device__ void dev_lambda_ref(int *out) {
+ [=](){
+ int &ref = global_device_var;
+ ref++;
+ *out = ref;
+ }();
+}
+
+// HOST-LABEL: @_ZZ29host_capture_host_ref_by_copyPiENKUlvE_clEv(
+// HOST: %[[VAL:.*]] = load i32, i32* @global_host_var
+// HOST: store i32 %[[VAL]]
+void host_capture_host_ref_by_copy(int *out) {
+ int &ref = global_host_var;
+ [=](){ *out = ref;}();
+}
+
+// HOST-LABEL: @_ZZ28host_capture_host_ref_by_refPiENKUlvE_clEv(
+// HOST: %[[CAP:.*]] = getelementptr inbounds %[[T2]], %[[T2]]* %this1, i32 0, i32 0
+// HOST: %[[REF:.*]] = load i32*, i32** %[[CAP]]
+// HOST: %[[VAL:.*]] = load i32, i32* %[[REF]]
+// HOST: %[[VAL2:.*]] = add nsw i32 %[[VAL]], 1
+// HOST: store i32 %[[VAL2]], i32* %[[REF]]
+// HOST: %[[VAL:.*]] = load i32, i32* @global_host_var
+// HOST: store i32 %[[VAL]]
+void host_capture_host_ref_by_ref(int *out) {
+ int &ref = global_host_var;
+ [&](){ ref++; *out = ref;}();
+}
+
+// HOST-LABEL: define void @_Z8host_refPi(
+// HOST: %[[VAL:.*]] = load i32, i32* @global_host_var
+// HOST: %[[VAL2:.*]] = add nsw i32 %[[VAL]], 1
+// HOST: store i32 %[[VAL2]], i32* @global_host_var
+// HOST: %[[VAL:.*]] = load i32, i32* @global_host_var
+// HOST: store i32 %[[VAL]]
+void host_ref(int *out) {
+ int &ref = global_host_var;
+ ref++;
+ *out = ref;
+}
+
+// HOST-LABEL: @_ZZ15host_lambda_refPiENKUlvE_clEv(
+// HOST: %[[VAL:.*]] = load i32, i32* @global_host_var
+// HOST: %[[VAL2:.*]] = add nsw i32 %[[VAL]], 1
+// HOST: store i32 %[[VAL2]], i32* @global_host_var
+// HOST: %[[VAL:.*]] = load i32, i32* @global_host_var
+// HOST: store i32 %[[VAL]]
+void host_lambda_ref(int *out) {
+ [=](){
+ int &ref = global_host_var;
+ ref++;
+ *out = ref;
+ }();
+}
+
+// HOST-LABEL: define void @_Z28dev_capture_host_ref_by_copyPi(
+// HOST: %[[CAP:.*]] = getelementptr inbounds %[[T3]], %[[T3]]* %{{.*}}, i32 0, i32 1
+// HOST: %[[VAL:.*]] = load i32, i32* @global_host_var
+// HOST: store i32 %[[VAL]], i32* %[[CAP]]
+// DEV-LABEL: define internal void @_ZZ28dev_capture_host_ref_by_copyPiENKUlvE_clEv(
+// DEV: %[[CAP:.*]] = getelementptr inbounds %[[T3]], %[[T3]]* %this1, i32 0, i32 1
+// DEV: %[[VAL:.*]] = load i32, i32* %[[CAP]]
+// DEV: store i32 %[[VAL]]
+void dev_capture_host_ref_by_copy(int *out) {
+ int &ref = global_host_var;
+ kern<<<1, 1>>>([=]__device__() { *out = ref;});
+}
+
More information about the llvm-branch-commits
mailing list