[flang-commits] [flang] 46caad5 - [flang][cuda] Do not produce data transfer in offloaded do concurrent (#147435)
via flang-commits
flang-commits at lists.llvm.org
Tue Jul 8 10:52:19 PDT 2025
Author: Valentin Clement (バレンタイン クレメン)
Date: 2025-07-08T10:52:15-07:00
New Revision: 46caad52ac14cefd6f9cf3188863818e330f3844
URL: https://github.com/llvm/llvm-project/commit/46caad52ac14cefd6f9cf3188863818e330f3844
DIFF: https://github.com/llvm/llvm-project/commit/46caad52ac14cefd6f9cf3188863818e330f3844.diff
LOG: [flang][cuda] Do not produce data transfer in offloaded do concurrent (#147435)
If a `do concurrent` loop is offloaded then there should be no CUDA data
transfer in it. Update the semantic and lowering to take that into
account.
`AssignmentChecker` has to be put into a separate pass because the
checkers in `SemanticsVisitor` cannot have the same `Enter/Leave`
functions. The `DoForallChecker` already has `Eneter/Leave` functions
for the `DoConstruct`.
Added:
Modified:
flang/include/flang/Optimizer/Builder/CUFCommon.h
flang/include/flang/Support/Fortran-features.h
flang/lib/Lower/Bridge.cpp
flang/lib/Optimizer/Builder/CUFCommon.cpp
flang/lib/Semantics/assignment.cpp
flang/lib/Semantics/assignment.h
flang/lib/Semantics/check-cuda.cpp
flang/lib/Semantics/check-cuda.h
flang/test/Lower/CUDA/cuda-data-transfer.cuf
flang/tools/bbc/bbc.cpp
Removed:
################################################################################
diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h
index 65b9cce1d2021..5c56dd6b695f8 100644
--- a/flang/include/flang/Optimizer/Builder/CUFCommon.h
+++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h
@@ -27,7 +27,8 @@ mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
mlir::SymbolTable &symTab);
bool isCUDADeviceContext(mlir::Operation *op);
-bool isCUDADeviceContext(mlir::Region &);
+bool isCUDADeviceContext(mlir::Region &,
+ bool isDoConcurrentOffloadEnabled = false);
bool isRegisteredDeviceGlobal(fir::GlobalOp op);
bool isRegisteredDeviceAttr(std::optional<cuf::DataAttribute> attr);
diff --git a/flang/include/flang/Support/Fortran-features.h b/flang/include/flang/Support/Fortran-features.h
index 8a2b7b29a5233..857de9479e4e5 100644
--- a/flang/include/flang/Support/Fortran-features.h
+++ b/flang/include/flang/Support/Fortran-features.h
@@ -55,7 +55,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
SavedLocalInSpecExpr, PrintNamelist, AssumedRankPassedToNonAssumedRank,
IgnoreIrrelevantAttributes, Unsigned, AmbiguousStructureConstructor,
ContiguousOkForSeqAssociation, ForwardRefExplicitTypeDummy,
- InaccessibleDeferredOverride, CudaWarpMatchFunction)
+ InaccessibleDeferredOverride, CudaWarpMatchFunction, DoConcurrentOffload)
// Portability and suspicious usage warnings
ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index ff35840a6668c..696473605a4e0 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -4886,7 +4886,10 @@ class FirConverter : public Fortran::lower::AbstractConverter {
mlir::Location loc = getCurrentLocation();
fir::FirOpBuilder &builder = getFirOpBuilder();
- bool isInDeviceContext = cuf::isCUDADeviceContext(builder.getRegion());
+ bool isInDeviceContext = cuf::isCUDADeviceContext(
+ builder.getRegion(),
+ getFoldingContext().languageFeatures().IsEnabled(
+ Fortran::common::LanguageFeature::DoConcurrentOffload));
bool isCUDATransfer =
IsCUDADataTransfer(assign.lhs, assign.rhs) && !isInDeviceContext;
diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp
index 5f286c04a7ca0..dcbf4991907bf 100644
--- a/flang/lib/Optimizer/Builder/CUFCommon.cpp
+++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp
@@ -43,7 +43,8 @@ bool cuf::isCUDADeviceContext(mlir::Operation *op) {
// for it.
// If the insertion point is inside an OpenACC region op, it is considered
// device context.
-bool cuf::isCUDADeviceContext(mlir::Region ®ion) {
+bool cuf::isCUDADeviceContext(mlir::Region ®ion,
+ bool isDoConcurrentOffloadEnabled) {
if (region.getParentOfType<cuf::KernelOp>())
return true;
if (region.getParentOfType<mlir::acc::ComputeRegionOpInterface>())
@@ -56,6 +57,9 @@ bool cuf::isCUDADeviceContext(mlir::Region ®ion) {
cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
}
}
+ if (isDoConcurrentOffloadEnabled &&
+ region.getParentOfType<fir::DoConcurrentLoopOp>())
+ return true;
return false;
}
diff --git a/flang/lib/Semantics/assignment.cpp b/flang/lib/Semantics/assignment.cpp
index 43e23a9d8f60b..88e08887160d9 100644
--- a/flang/lib/Semantics/assignment.cpp
+++ b/flang/lib/Semantics/assignment.cpp
@@ -42,7 +42,6 @@ class AssignmentContext {
void Analyze(const parser::AssignmentStmt &);
void Analyze(const parser::PointerAssignmentStmt &);
void Analyze(const parser::ConcurrentControl &);
- int deviceConstructDepth_{0};
SemanticsContext &context() { return context_; }
private:
@@ -97,21 +96,6 @@ void AssignmentContext::Analyze(const parser::AssignmentStmt &stmt) {
if (whereDepth_ > 0) {
CheckShape(lhsLoc, &lhs);
}
- if (context_.foldingContext().languageFeatures().IsEnabled(
- common::LanguageFeature::CUDA)) {
- const auto &scope{context_.FindScope(lhsLoc)};
- const Scope &progUnit{GetProgramUnitContaining(scope)};
- if (!IsCUDADeviceContext(&progUnit) && deviceConstructDepth_ == 0) {
- if (Fortran::evaluate::HasCUDADeviceAttrs(lhs) &&
- Fortran::evaluate::HasCUDAImplicitTransfer(rhs)) {
- if (GetNbOfCUDAManagedOrUnifiedSymbols(lhs) == 1 &&
- GetNbOfCUDAManagedOrUnifiedSymbols(rhs) == 1 &&
- GetNbOfCUDADeviceSymbols(rhs) == 1)
- return; // This is a special case handled on the host.
- context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US);
- }
- }
- }
}
}
@@ -254,46 +238,6 @@ void AssignmentChecker::Enter(const parser::MaskedElsewhereStmt &x) {
void AssignmentChecker::Leave(const parser::MaskedElsewhereStmt &) {
context_.value().PopWhereContext();
}
-void AssignmentChecker::Enter(const parser::CUFKernelDoConstruct &x) {
- ++context_.value().deviceConstructDepth_;
-}
-void AssignmentChecker::Leave(const parser::CUFKernelDoConstruct &) {
- --context_.value().deviceConstructDepth_;
-}
-static bool IsOpenACCComputeConstruct(const parser::OpenACCBlockConstruct &x) {
- const auto &beginBlockDirective =
- std::get<Fortran::parser::AccBeginBlockDirective>(x.t);
- const auto &blockDirective =
- std::get<Fortran::parser::AccBlockDirective>(beginBlockDirective.t);
- if (blockDirective.v == llvm::acc::ACCD_parallel ||
- blockDirective.v == llvm::acc::ACCD_serial ||
- blockDirective.v == llvm::acc::ACCD_kernels) {
- return true;
- }
- return false;
-}
-void AssignmentChecker::Enter(const parser::OpenACCBlockConstruct &x) {
- if (IsOpenACCComputeConstruct(x)) {
- ++context_.value().deviceConstructDepth_;
- }
-}
-void AssignmentChecker::Leave(const parser::OpenACCBlockConstruct &x) {
- if (IsOpenACCComputeConstruct(x)) {
- --context_.value().deviceConstructDepth_;
- }
-}
-void AssignmentChecker::Enter(const parser::OpenACCCombinedConstruct &) {
- ++context_.value().deviceConstructDepth_;
-}
-void AssignmentChecker::Leave(const parser::OpenACCCombinedConstruct &) {
- --context_.value().deviceConstructDepth_;
-}
-void AssignmentChecker::Enter(const parser::OpenACCLoopConstruct &) {
- ++context_.value().deviceConstructDepth_;
-}
-void AssignmentChecker::Leave(const parser::OpenACCLoopConstruct &) {
- --context_.value().deviceConstructDepth_;
-}
} // namespace Fortran::semantics
template class Fortran::common::Indirection<
diff --git a/flang/lib/Semantics/assignment.h b/flang/lib/Semantics/assignment.h
index 4a1bb92037119..ba537744bfaaa 100644
--- a/flang/lib/Semantics/assignment.h
+++ b/flang/lib/Semantics/assignment.h
@@ -46,14 +46,6 @@ class AssignmentChecker : public virtual BaseChecker {
void Leave(const parser::EndWhereStmt &);
void Enter(const parser::MaskedElsewhereStmt &);
void Leave(const parser::MaskedElsewhereStmt &);
- void Enter(const parser::CUFKernelDoConstruct &);
- void Leave(const parser::CUFKernelDoConstruct &);
- void Enter(const parser::OpenACCBlockConstruct &);
- void Leave(const parser::OpenACCBlockConstruct &);
- void Enter(const parser::OpenACCCombinedConstruct &);
- void Leave(const parser::OpenACCCombinedConstruct &);
- void Enter(const parser::OpenACCLoopConstruct &);
- void Leave(const parser::OpenACCLoopConstruct &);
SemanticsContext &context();
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index 8decfb0149829..b01147606a99c 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -685,18 +685,67 @@ void CUDAChecker::Enter(const parser::CUFKernelDoConstruct &x) {
std::get<std::list<parser::CUFReduction>>(directive.t)) {
CheckReduce(context_, reduce);
}
- inCUFKernelDoConstruct_ = true;
+ ++deviceConstructDepth_;
+}
+
+static bool IsOpenACCComputeConstruct(const parser::OpenACCBlockConstruct &x) {
+ const auto &beginBlockDirective =
+ std::get<Fortran::parser::AccBeginBlockDirective>(x.t);
+ const auto &blockDirective =
+ std::get<Fortran::parser::AccBlockDirective>(beginBlockDirective.t);
+ if (blockDirective.v == llvm::acc::ACCD_parallel ||
+ blockDirective.v == llvm::acc::ACCD_serial ||
+ blockDirective.v == llvm::acc::ACCD_kernels) {
+ return true;
+ }
+ return false;
}
void CUDAChecker::Leave(const parser::CUFKernelDoConstruct &) {
- inCUFKernelDoConstruct_ = false;
+ --deviceConstructDepth_;
+}
+void CUDAChecker::Enter(const parser::OpenACCBlockConstruct &x) {
+ if (IsOpenACCComputeConstruct(x)) {
+ ++deviceConstructDepth_;
+ }
+}
+void CUDAChecker::Leave(const parser::OpenACCBlockConstruct &x) {
+ if (IsOpenACCComputeConstruct(x)) {
+ --deviceConstructDepth_;
+ }
+}
+void CUDAChecker::Enter(const parser::OpenACCCombinedConstruct &) {
+ ++deviceConstructDepth_;
+}
+void CUDAChecker::Leave(const parser::OpenACCCombinedConstruct &) {
+ --deviceConstructDepth_;
+}
+void CUDAChecker::Enter(const parser::OpenACCLoopConstruct &) {
+ ++deviceConstructDepth_;
+}
+void CUDAChecker::Leave(const parser::OpenACCLoopConstruct &) {
+ --deviceConstructDepth_;
+}
+void CUDAChecker::Enter(const parser::DoConstruct &x) {
+ if (x.IsDoConcurrent() &&
+ context_.foldingContext().languageFeatures().IsEnabled(
+ common::LanguageFeature::DoConcurrentOffload)) {
+ ++deviceConstructDepth_;
+ }
+}
+void CUDAChecker::Leave(const parser::DoConstruct &x) {
+ if (x.IsDoConcurrent() &&
+ context_.foldingContext().languageFeatures().IsEnabled(
+ common::LanguageFeature::DoConcurrentOffload)) {
+ --deviceConstructDepth_;
+ }
}
void CUDAChecker::Enter(const parser::AssignmentStmt &x) {
auto lhsLoc{std::get<parser::Variable>(x.t).GetSource()};
const auto &scope{context_.FindScope(lhsLoc)};
const Scope &progUnit{GetProgramUnitContaining(scope)};
- if (IsCUDADeviceContext(&progUnit) || inCUFKernelDoConstruct_) {
+ if (IsCUDADeviceContext(&progUnit) || deviceConstructDepth_ > 0) {
return; // Data transfer with assignment is only perform on host.
}
@@ -714,6 +763,16 @@ void CUDAChecker::Enter(const parser::AssignmentStmt &x) {
context_.Say(lhsLoc,
"More than one reference to a CUDA object on the right hand side of the assigment"_err_en_US);
}
+
+ if (Fortran::evaluate::HasCUDADeviceAttrs(assign->lhs) &&
+ Fortran::evaluate::HasCUDAImplicitTransfer(assign->rhs)) {
+ if (GetNbOfCUDAManagedOrUnifiedSymbols(assign->lhs) == 1 &&
+ GetNbOfCUDAManagedOrUnifiedSymbols(assign->rhs) == 1 &&
+ GetNbOfCUDADeviceSymbols(assign->rhs) == 1) {
+ return; // This is a special case handled on the host.
+ }
+ context_.Say(lhsLoc, "Unsupported CUDA data transfer"_err_en_US);
+ }
}
} // namespace Fortran::semantics
diff --git a/flang/lib/Semantics/check-cuda.h b/flang/lib/Semantics/check-cuda.h
index 222a2ee04b57c..10000253ffe5a 100644
--- a/flang/lib/Semantics/check-cuda.h
+++ b/flang/lib/Semantics/check-cuda.h
@@ -41,10 +41,18 @@ class CUDAChecker : public virtual BaseChecker {
void Enter(const parser::CUFKernelDoConstruct &);
void Leave(const parser::CUFKernelDoConstruct &);
void Enter(const parser::AssignmentStmt &);
+ void Enter(const parser::OpenACCBlockConstruct &);
+ void Leave(const parser::OpenACCBlockConstruct &);
+ void Enter(const parser::OpenACCCombinedConstruct &);
+ void Leave(const parser::OpenACCCombinedConstruct &);
+ void Enter(const parser::OpenACCLoopConstruct &);
+ void Leave(const parser::OpenACCLoopConstruct &);
+ void Enter(const parser::DoConstruct &);
+ void Leave(const parser::DoConstruct &);
private:
SemanticsContext &context_;
- bool inCUFKernelDoConstruct_ = false;
+ int deviceConstructDepth_{0};
};
bool CanonicalizeCUDA(parser::Program &);
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index d0032af415604..68a0202f951fe 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -403,3 +403,19 @@ end subroutine
! CHECK-LABEL: func.func @_QPsub20()
! CHECK-NOT: cuf.data_transfer
! CHECK: hlfir.assign
+
+subroutine sub21()
+ real, allocatable,device:: a(:,:), b(:,:)
+ real:: s
+ integer:: i,j,N=16
+ allocate(a(N,N),b(N,N))
+ do concurrent(i=1:N, j=1:N) reduce(+:s)
+ b(i,j)=a(i,j)**2
+ s=s+b(i,j)
+ end do
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub21()
+! CHECK: fir.do_concurrent.loop
+! CHECK-NOT: cuf.data_transfer
+! CHECK: hlfir.assign
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index 59372a8eb58ed..d216069654bc4 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -223,6 +223,11 @@ static llvm::cl::opt<bool> enableCUDA("fcuda",
llvm::cl::desc("enable CUDA Fortran"),
llvm::cl::init(false));
+static llvm::cl::opt<bool>
+ enableDoConcurrentOffload("fdoconcurrent-offload",
+ llvm::cl::desc("enable do concurrent offload"),
+ llvm::cl::init(false));
+
static llvm::cl::opt<bool>
disableCUDAWarpFunction("fcuda-disable-warp-function",
llvm::cl::desc("Disable CUDA Warp Function"),
@@ -608,6 +613,11 @@ int main(int argc, char **argv) {
options.features.Enable(Fortran::common::LanguageFeature::CUDA);
}
+ if (enableDoConcurrentOffload) {
+ options.features.Enable(
+ Fortran::common::LanguageFeature::DoConcurrentOffload);
+ }
+
if (disableCUDAWarpFunction) {
options.features.Enable(
Fortran::common::LanguageFeature::CudaWarpMatchFunction, false);
More information about the flang-commits
mailing list