[flang-commits] [flang] [mlir] [mlir][acc] Added async to data clause operations. (PR #97307)

Slava Zakharin via flang-commits flang-commits at lists.llvm.org
Mon Jul 1 08:18:23 PDT 2024


https://github.com/vzakhari created https://github.com/llvm/llvm-project/pull/97307

Because the data clause operations are not tightly
"associated" with the compute/data operations (e.g.
they can be optimized as SSA producers and made block
arguments), the information about the original async()
clause should be attached to the data clause operations
themselves, making it easier to generate proper runtime
actions for them. This change propagates the async() information
from the OpenACC data/compute constructs to the data clause
operations. This change also adds the CurrentDeviceIdResource
to guarantee proper ordering of the operations that read
and write the current device identifier.


>From a2b3a1d32bf2e166d4300477959c8e33e9583b07 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Mon, 1 Jul 2024 06:09:12 -0700
Subject: [PATCH] [mlir][acc] Added async to data clause operations.

Because the data clause operations are not tightly
"associated" with the compute/data operations (e.g.
they can be optimized as SSA producers and made block
arguments), the information about the original async()
clause should be attached to the data clause operations
themselves, making it easier to generate proper runtime
actions for them. This change propagates the async() information
from the OpenACC data/compute constructs to the data clause
operations. This change also adds the CurrentDeviceIdResource
to guarantee proper ordering of the operations that read
and write the current device identifier.
---
 flang/lib/Lower/OpenACC.cpp                   | 411 ++++++++++++------
 flang/test/Lower/OpenACC/acc-data.f90         |   6 +-
 flang/test/Lower/OpenACC/acc-enter-data.f90   |  12 +-
 flang/test/Lower/OpenACC/acc-exit-data.f90    |  16 +-
 flang/test/Lower/OpenACC/acc-parallel.f90     |  14 +-
 flang/test/Lower/OpenACC/acc-serial.f90       |   4 +-
 flang/test/Lower/OpenACC/acc-update.f90       |  24 +-
 mlir/include/mlir/Dialect/OpenACC/OpenACC.h   |  20 +
 .../mlir/Dialect/OpenACC/OpenACCOps.td        | 235 +++++++++-
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp       |  30 ++
 10 files changed, 580 insertions(+), 192 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 166fa686cd883..6266a5056ace8 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -58,13 +58,34 @@ genOperandLocation(Fortran::lower::AbstractConverter &converter,
   return loc;
 }
 
+static void addOperands(llvm::SmallVectorImpl<mlir::Value> &operands,
+                        llvm::SmallVectorImpl<int32_t> &operandSegments,
+                        llvm::ArrayRef<mlir::Value> clauseOperands) {
+  operands.append(clauseOperands.begin(), clauseOperands.end());
+  operandSegments.push_back(clauseOperands.size());
+}
+
+static void addOperand(llvm::SmallVectorImpl<mlir::Value> &operands,
+                       llvm::SmallVectorImpl<int32_t> &operandSegments,
+                       const mlir::Value &clauseOperand) {
+  if (clauseOperand) {
+    operands.push_back(clauseOperand);
+    operandSegments.push_back(1);
+  } else {
+    operandSegments.push_back(0);
+  }
+}
+
 template <typename Op>
-static Op createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc,
-                            mlir::Value baseAddr, std::stringstream &name,
-                            mlir::SmallVector<mlir::Value> bounds,
-                            bool structured, bool implicit,
-                            mlir::acc::DataClause dataClause, mlir::Type retTy,
-                            mlir::Value isPresent = {}) {
+static Op
+createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc,
+                  mlir::Value baseAddr, std::stringstream &name,
+                  mlir::SmallVector<mlir::Value> bounds, bool structured,
+                  bool implicit, mlir::acc::DataClause dataClause,
+                  mlir::Type retTy, llvm::ArrayRef<mlir::Value> async,
+                  llvm::ArrayRef<mlir::Attribute> asyncDeviceTypes,
+                  llvm::ArrayRef<mlir::Attribute> asyncOnlyDeviceTypes,
+                  mlir::Value isPresent = {}) {
   mlir::Value varPtrPtr;
   if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(baseAddr.getType())) {
     if (isPresent) {
@@ -92,20 +113,25 @@ static Op createDataEntryOp(fir::FirOpBuilder &builder, mlir::Location loc,
     retTy = baseAddr.getType();
   }
 
-  Op op = builder.create<Op>(loc, retTy, baseAddr);
+  llvm::SmallVector<mlir::Value, 8> operands;
+  llvm::SmallVector<int32_t, 8> operandSegments;
+
+  addOperand(operands, operandSegments, baseAddr);
+  addOperand(operands, operandSegments, varPtrPtr);
+  addOperands(operands, operandSegments, bounds);
+  addOperands(operands, operandSegments, async);
+
+  Op op = builder.create<Op>(loc, retTy, operands);
   op.setNameAttr(builder.getStringAttr(name.str()));
   op.setStructured(structured);
   op.setImplicit(implicit);
   op.setDataClause(dataClause);
-
-  unsigned insPos = 1;
-  if (varPtrPtr)
-    op->insertOperands(insPos++, varPtrPtr);
-  if (bounds.size() > 0)
-    op->insertOperands(insPos, bounds);
   op->setAttr(Op::getOperandSegmentSizeAttr(),
-              builder.getDenseI32ArrayAttr(
-                  {1, varPtrPtr ? 1 : 0, static_cast<int32_t>(bounds.size())}));
+              builder.getDenseI32ArrayAttr(operandSegments));
+  if (!asyncDeviceTypes.empty())
+    op.setAsyncOperandsDeviceTypeAttr(builder.getArrayAttr(asyncDeviceTypes));
+  if (!asyncOnlyDeviceTypes.empty())
+    op.setAsyncOnlyAttr(builder.getArrayAttr(asyncOnlyDeviceTypes));
   return op;
 }
 
@@ -174,7 +200,8 @@ static void createDeclareAllocFuncWithArg(mlir::OpBuilder &modBuilder,
       createDataEntryOp<mlir::acc::UpdateDeviceOp>(
           builder, loc, registerFuncOp.getArgument(0), asFortranDesc, bounds,
           /*structured=*/false, /*implicit=*/true,
-          mlir::acc::DataClause::acc_update_device, descTy);
+          mlir::acc::DataClause::acc_update_device, descTy,
+          /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
   llvm::SmallVector<int32_t> operandSegments{0, 0, 0, 1};
   llvm::SmallVector<mlir::Value> operands{updateDeviceOp.getResult()};
   createSimpleOp<mlir::acc::UpdateOp>(builder, loc, operands, operandSegments);
@@ -185,7 +212,8 @@ static void createDeclareAllocFuncWithArg(mlir::OpBuilder &modBuilder,
   addDeclareAttr(builder, boxAddrOp.getOperation(), clause);
   EntryOp entryOp = createDataEntryOp<EntryOp>(
       builder, loc, boxAddrOp.getResult(), asFortran, bounds,
-      /*structured=*/false, /*implicit=*/false, clause, boxAddrOp.getType());
+      /*structured=*/false, /*implicit=*/false, clause, boxAddrOp.getType(),
+      /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
   builder.create<mlir::acc::DeclareEnterOp>(
       loc, mlir::acc::DeclareTokenType::get(entryOp.getContext()),
       mlir::ValueRange(entryOp.getAccPtr()));
@@ -217,8 +245,8 @@ static void createDeclareDeallocFuncWithArg(
   mlir::acc::GetDevicePtrOp entryOp =
       createDataEntryOp<mlir::acc::GetDevicePtrOp>(
           builder, loc, boxAddrOp.getResult(), asFortran, bounds,
-          /*structured=*/false, /*implicit=*/false, clause,
-          boxAddrOp.getType());
+          /*structured=*/false, /*implicit=*/false, clause, boxAddrOp.getType(),
+          /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
   builder.create<mlir::acc::DeclareExitOp>(
       loc, mlir::Value{}, mlir::ValueRange(entryOp.getAccPtr()));
 
@@ -226,12 +254,16 @@ static void createDeclareDeallocFuncWithArg(
                 std::is_same_v<ExitOp, mlir::acc::UpdateHostOp>)
     builder.create<ExitOp>(entryOp.getLoc(), entryOp.getAccPtr(),
                            entryOp.getVarPtr(), entryOp.getBounds(),
-                           entryOp.getDataClause(),
+                           entryOp.getAsyncOperands(),
+                           entryOp.getAsyncOperandsDeviceTypeAttr(),
+                           entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(),
                            /*structured=*/false, /*implicit=*/false,
                            builder.getStringAttr(*entryOp.getName()));
   else
     builder.create<ExitOp>(entryOp.getLoc(), entryOp.getAccPtr(),
-                           entryOp.getBounds(), entryOp.getDataClause(),
+                           entryOp.getBounds(), entryOp.getAsyncOperands(),
+                           entryOp.getAsyncOperandsDeviceTypeAttr(),
+                           entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(),
                            /*structured=*/false, /*implicit=*/false,
                            builder.getStringAttr(*entryOp.getName()));
 
@@ -248,7 +280,8 @@ static void createDeclareDeallocFuncWithArg(
       createDataEntryOp<mlir::acc::UpdateDeviceOp>(
           builder, loc, loadOp, asFortran, bounds,
           /*structured=*/false, /*implicit=*/true,
-          mlir::acc::DataClause::acc_update_device, loadOp.getType());
+          mlir::acc::DataClause::acc_update_device, loadOp.getType(),
+          /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
   llvm::SmallVector<int32_t> operandSegments{0, 0, 0, 1};
   llvm::SmallVector<mlir::Value> operands{updateDeviceOp.getResult()};
   createSimpleOp<mlir::acc::UpdateOp>(builder, loc, operands, operandSegments);
@@ -290,7 +323,10 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList,
                          Fortran::lower::StatementContext &stmtCtx,
                          llvm::SmallVectorImpl<mlir::Value> &dataOperands,
                          mlir::acc::DataClause dataClause, bool structured,
-                         bool implicit, bool setDeclareAttr = false) {
+                         bool implicit, llvm::ArrayRef<mlir::Value> async,
+                         llvm::ArrayRef<mlir::Attribute> asyncDeviceTypes,
+                         llvm::ArrayRef<mlir::Attribute> asyncOnlyDeviceTypes,
+                         bool setDeclareAttr = false) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   Fortran::evaluate::ExpressionAnalyzer ea{semanticsContext};
   for (const auto &accObject : objectList.v) {
@@ -316,7 +352,8 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList,
             : info.addr;
     Op op = createDataEntryOp<Op>(builder, operandLocation, baseAddr, asFortran,
                                   bounds, structured, implicit, dataClause,
-                                  baseAddr.getType(), info.isPresent);
+                                  baseAddr.getType(), async, asyncDeviceTypes,
+                                  asyncOnlyDeviceTypes, info.isPresent);
     dataOperands.push_back(op.getAccPtr());
   }
 }
@@ -345,7 +382,8 @@ static void genDeclareDataOperandOperations(
             operandLocation, asFortran, bounds);
     EntryOp op = createDataEntryOp<EntryOp>(
         builder, operandLocation, info.addr, asFortran, bounds, structured,
-        implicit, dataClause, info.addr.getType());
+        implicit, dataClause, info.addr.getType(),
+        /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
     dataOperands.push_back(op.getAccPtr());
     addDeclareAttr(builder, op.getVarPtr().getDefiningOp(), dataClause);
     if (mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(info.addr.getType()))) {
@@ -397,13 +435,16 @@ static void genDataExitOperations(fir::FirOpBuilder &builder,
                   std::is_same_v<ExitOp, mlir::acc::UpdateHostOp>)
       builder.create<ExitOp>(
           entryOp.getLoc(), entryOp.getAccPtr(), entryOp.getVarPtr(),
-          entryOp.getBounds(), entryOp.getDataClause(), structured,
-          entryOp.getImplicit(), builder.getStringAttr(*entryOp.getName()));
+          entryOp.getBounds(), entryOp.getAsyncOperands(),
+          entryOp.getAsyncOperandsDeviceTypeAttr(), entryOp.getAsyncOnlyAttr(),
+          entryOp.getDataClause(), structured, entryOp.getImplicit(),
+          builder.getStringAttr(*entryOp.getName()));
     else
-      builder.create<ExitOp>(entryOp.getLoc(), entryOp.getAccPtr(),
-                             entryOp.getBounds(), entryOp.getDataClause(),
-                             structured, entryOp.getImplicit(),
-                             builder.getStringAttr(*entryOp.getName()));
+      builder.create<ExitOp>(
+          entryOp.getLoc(), entryOp.getAccPtr(), entryOp.getBounds(),
+          entryOp.getAsyncOperands(), entryOp.getAsyncOperandsDeviceTypeAttr(),
+          entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(), structured,
+          entryOp.getImplicit(), builder.getStringAttr(*entryOp.getName()));
   }
 }
 
@@ -783,7 +824,10 @@ genPrivatizations(const Fortran::parser::AccObjectList &objectList,
                   Fortran::semantics::SemanticsContext &semanticsContext,
                   Fortran::lower::StatementContext &stmtCtx,
                   llvm::SmallVectorImpl<mlir::Value> &dataOperands,
-                  llvm::SmallVector<mlir::Attribute> &privatizations) {
+                  llvm::SmallVector<mlir::Attribute> &privatizations,
+                  llvm::ArrayRef<mlir::Value> async,
+                  llvm::ArrayRef<mlir::Attribute> asyncDeviceTypes,
+                  llvm::ArrayRef<mlir::Attribute> asyncOnlyDeviceTypes) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   Fortran::evaluate::ExpressionAnalyzer ea{semanticsContext};
   for (const auto &accObject : objectList.v) {
@@ -808,7 +852,8 @@ genPrivatizations(const Fortran::parser::AccObjectList &objectList,
                                                         operandLocation, retTy);
       auto op = createDataEntryOp<mlir::acc::PrivateOp>(
           builder, operandLocation, info.addr, asFortran, bounds, true,
-          /*implicit=*/false, mlir::acc::DataClause::acc_private, retTy);
+          /*implicit=*/false, mlir::acc::DataClause::acc_private, retTy, async,
+          asyncDeviceTypes, asyncOnlyDeviceTypes);
       dataOperands.push_back(op.getAccPtr());
     } else {
       std::string suffix =
@@ -819,7 +864,8 @@ genPrivatizations(const Fortran::parser::AccObjectList &objectList,
           builder, recipeName, operandLocation, retTy, bounds);
       auto op = createDataEntryOp<mlir::acc::FirstprivateOp>(
           builder, operandLocation, info.addr, asFortran, bounds, true,
-          /*implicit=*/false, mlir::acc::DataClause::acc_firstprivate, retTy);
+          /*implicit=*/false, mlir::acc::DataClause::acc_firstprivate, retTy,
+          async, asyncDeviceTypes, asyncOnlyDeviceTypes);
       dataOperands.push_back(op.getAccPtr());
     }
     privatizations.push_back(mlir::SymbolRefAttr::get(
@@ -1354,7 +1400,10 @@ genReductions(const Fortran::parser::AccObjectListWithReduction &objectList,
               Fortran::semantics::SemanticsContext &semanticsContext,
               Fortran::lower::StatementContext &stmtCtx,
               llvm::SmallVectorImpl<mlir::Value> &reductionOperands,
-              llvm::SmallVector<mlir::Attribute> &reductionRecipes) {
+              llvm::SmallVector<mlir::Attribute> &reductionRecipes,
+              llvm::ArrayRef<mlir::Value> async,
+              llvm::ArrayRef<mlir::Attribute> asyncDeviceTypes,
+              llvm::ArrayRef<mlir::Attribute> asyncOnlyDeviceTypes) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   const auto &objects = std::get<Fortran::parser::AccObjectList>(objectList.t);
   const auto &op = std::get<Fortran::parser::ReductionOperator>(objectList.t);
@@ -1383,7 +1432,8 @@ genReductions(const Fortran::parser::AccObjectListWithReduction &objectList,
     auto op = createDataEntryOp<mlir::acc::ReductionOp>(
         builder, operandLocation, info.addr, asFortran, bounds,
         /*structured=*/true, /*implicit=*/false,
-        mlir::acc::DataClause::acc_reduction, info.addr.getType());
+        mlir::acc::DataClause::acc_reduction, info.addr.getType(), async,
+        asyncDeviceTypes, asyncOnlyDeviceTypes);
     mlir::Type ty = op.getAccPtr().getType();
     if (!areAllBoundConstant(bounds) ||
         fir::isAssumedShape(info.addr.getType()) ||
@@ -1404,25 +1454,6 @@ genReductions(const Fortran::parser::AccObjectListWithReduction &objectList,
   }
 }
 
-static void
-addOperands(llvm::SmallVectorImpl<mlir::Value> &operands,
-            llvm::SmallVectorImpl<int32_t> &operandSegments,
-            const llvm::SmallVectorImpl<mlir::Value> &clauseOperands) {
-  operands.append(clauseOperands.begin(), clauseOperands.end());
-  operandSegments.push_back(clauseOperands.size());
-}
-
-static void addOperand(llvm::SmallVectorImpl<mlir::Value> &operands,
-                       llvm::SmallVectorImpl<int32_t> &operandSegments,
-                       const mlir::Value &clauseOperand) {
-  if (clauseOperand) {
-    operands.push_back(clauseOperand);
-    operandSegments.push_back(1);
-  } else {
-    operandSegments.push_back(0);
-  }
-}
-
 template <typename Op, typename Terminator>
 static Op
 createRegionOp(fir::FirOpBuilder &builder, mlir::Location loc,
@@ -1656,7 +1687,8 @@ static void privatizeIv(Fortran::lower::AbstractConverter &converter,
   std::stringstream asFortran;
   auto op = createDataEntryOp<mlir::acc::PrivateOp>(
       builder, loc, ivValue, asFortran, {}, true, /*implicit=*/true,
-      mlir::acc::DataClause::acc_private, ivValue.getType());
+      mlir::acc::DataClause::acc_private, ivValue.getType(),
+      /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
 
   privateOperands.push_back(op.getAccPtr());
   privatizations.push_back(mlir::SymbolRefAttr::get(builder.getContext(),
@@ -1897,12 +1929,14 @@ static mlir::acc::LoopOp createLoopOp(
                        &clause.u)) {
       genPrivatizations<mlir::acc::PrivateRecipeOp>(
           privateClause->v, converter, semanticsContext, stmtCtx,
-          privateOperands, privatizations);
+          privateOperands, privatizations, /*async=*/{},
+          /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
     } else if (const auto *reductionClause =
                    std::get_if<Fortran::parser::AccClause::Reduction>(
                        &clause.u)) {
       genReductions(reductionClause->v, converter, semanticsContext, stmtCtx,
-                    reductionOperands, reductionRecipes);
+                    reductionOperands, reductionRecipes, /*async=*/{},
+                    /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
     } else if (std::get_if<Fortran::parser::AccClause::Seq>(&clause.u)) {
       for (auto crtDeviceTypeAttr : crtDeviceTypes)
         seqDeviceTypes.push_back(crtDeviceTypeAttr);
@@ -2088,6 +2122,9 @@ static void genDataOperandOperationsWithModifier(
     llvm::SmallVectorImpl<mlir::Value> &dataClauseOperands,
     const mlir::acc::DataClause clause,
     const mlir::acc::DataClause clauseWithModifier,
+    llvm::ArrayRef<mlir::Value> async,
+    llvm::ArrayRef<mlir::Attribute> asyncDeviceTypes,
+    llvm::ArrayRef<mlir::Attribute> asyncOnlyDeviceTypes,
     bool setDeclareAttr = false) {
   const Fortran::parser::AccObjectListWithModifier &listWithModifier = x->v;
   const auto &accObjectList =
@@ -2099,7 +2136,8 @@ static void genDataOperandOperationsWithModifier(
       (modifier && (*modifier).v == mod) ? clauseWithModifier : clause;
   genDataOperandOperations<Op>(accObjectList, converter, semanticsContext,
                                stmtCtx, dataClauseOperands, dataClause,
-                               /*structured=*/true, /*implicit=*/false,
+                               /*structured=*/true, /*implicit=*/false, async,
+                               asyncDeviceTypes, asyncOnlyDeviceTypes,
                                setDeclareAttr);
 }
 
@@ -2150,8 +2188,9 @@ static Op createComputeOp(
   // Lower clauses values mapped to operands and array attributes.
   // Keep track of each group of operands separately as clauses can appear
   // more than once.
+
+  // Process the clauses that may have a specified device_type first.
   for (const Fortran::parser::AccClause &clause : accClauseList.v) {
-    mlir::Location clauseLocation = converter.genLocation(clause.source);
     if (const auto *asyncClause =
             std::get_if<Fortran::parser::AccClause::Async>(&clause.u)) {
       genAsyncClause(converter, asyncClause, async, asyncDeviceTypes,
@@ -2193,8 +2232,19 @@ static Op createComputeOp(
         vectorLength.push_back(vectorLengthValue);
         vectorLengthDeviceTypes.push_back(crtDeviceTypeAttr);
       }
-    } else if (const auto *ifClause =
-                   std::get_if<Fortran::parser::AccClause::If>(&clause.u)) {
+    } else if (const auto *deviceTypeClause =
+                   std::get_if<Fortran::parser::AccClause::DeviceType>(
+                       &clause.u)) {
+      crtDeviceTypes.clear();
+      gatherDeviceTypeAttrs(builder, deviceTypeClause, crtDeviceTypes);
+    }
+  }
+
+  // Process the clauses independent of device_type.
+  for (const Fortran::parser::AccClause &clause : accClauseList.v) {
+    mlir::Location clauseLocation = converter.genLocation(clause.source);
+    if (const auto *ifClause =
+            std::get_if<Fortran::parser::AccClause::If>(&clause.u)) {
       genIfClause(converter, clauseLocation, ifClause, ifCond, stmtCtx);
     } else if (const auto *selfClause =
                    std::get_if<Fortran::parser::AccClause::Self>(&clause.u)) {
@@ -2237,7 +2287,8 @@ static Op createComputeOp(
       genDataOperandOperations<mlir::acc::CopyinOp>(
           copyClause->v, converter, semanticsContext, stmtCtx,
           dataClauseOperands, mlir::acc::DataClause::acc_copy,
-          /*structured=*/true, /*implicit=*/false);
+          /*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
       copyEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
                                dataClauseOperands.end());
     } else if (const auto *copyinClause =
@@ -2247,7 +2298,8 @@ static Op createComputeOp(
           copyinClause, converter, semanticsContext, stmtCtx,
           Fortran::parser::AccDataModifier::Modifier::ReadOnly,
           dataClauseOperands, mlir::acc::DataClause::acc_copyin,
-          mlir::acc::DataClause::acc_copyin_readonly);
+          mlir::acc::DataClause::acc_copyin_readonly, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
     } else if (const auto *copyoutClause =
                    std::get_if<Fortran::parser::AccClause::Copyout>(
                        &clause.u)) {
@@ -2257,7 +2309,8 @@ static Op createComputeOp(
           copyoutClause, converter, semanticsContext, stmtCtx,
           Fortran::parser::AccDataModifier::Modifier::ReadOnly,
           dataClauseOperands, mlir::acc::DataClause::acc_copyout,
-          mlir::acc::DataClause::acc_copyout_zero);
+          mlir::acc::DataClause::acc_copyout_zero, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
       copyoutEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
                                   dataClauseOperands.end());
     } else if (const auto *createClause =
@@ -2268,7 +2321,8 @@ static Op createComputeOp(
           createClause, converter, semanticsContext, stmtCtx,
           Fortran::parser::AccDataModifier::Modifier::Zero, dataClauseOperands,
           mlir::acc::DataClause::acc_create,
-          mlir::acc::DataClause::acc_create_zero);
+          mlir::acc::DataClause::acc_create_zero, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
       createEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
                                  dataClauseOperands.end());
     } else if (const auto *noCreateClause =
@@ -2277,28 +2331,32 @@ static Op createComputeOp(
       genDataOperandOperations<mlir::acc::NoCreateOp>(
           noCreateClause->v, converter, semanticsContext, stmtCtx,
           dataClauseOperands, mlir::acc::DataClause::acc_no_create,
-          /*structured=*/true, /*implicit=*/false);
+          /*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
     } else if (const auto *presentClause =
                    std::get_if<Fortran::parser::AccClause::Present>(
                        &clause.u)) {
       genDataOperandOperations<mlir::acc::PresentOp>(
           presentClause->v, converter, semanticsContext, stmtCtx,
           dataClauseOperands, mlir::acc::DataClause::acc_present,
-          /*structured=*/true, /*implicit=*/false);
+          /*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
     } else if (const auto *devicePtrClause =
                    std::get_if<Fortran::parser::AccClause::Deviceptr>(
                        &clause.u)) {
       genDataOperandOperations<mlir::acc::DevicePtrOp>(
           devicePtrClause->v, converter, semanticsContext, stmtCtx,
           dataClauseOperands, mlir::acc::DataClause::acc_deviceptr,
-          /*structured=*/true, /*implicit=*/false);
+          /*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
     } else if (const auto *attachClause =
                    std::get_if<Fortran::parser::AccClause::Attach>(&clause.u)) {
       auto crtDataStart = dataClauseOperands.size();
       genDataOperandOperations<mlir::acc::AttachOp>(
           attachClause->v, converter, semanticsContext, stmtCtx,
           dataClauseOperands, mlir::acc::DataClause::acc_attach,
-          /*structured=*/true, /*implicit=*/false);
+          /*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
       attachEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
                                  dataClauseOperands.end());
     } else if (const auto *privateClause =
@@ -2307,13 +2365,15 @@ static Op createComputeOp(
       if (!combinedConstructs)
         genPrivatizations<mlir::acc::PrivateRecipeOp>(
             privateClause->v, converter, semanticsContext, stmtCtx,
-            privateOperands, privatizations);
+            privateOperands, privatizations, async, asyncDeviceTypes,
+            asyncOnlyDeviceTypes);
     } else if (const auto *firstprivateClause =
                    std::get_if<Fortran::parser::AccClause::Firstprivate>(
                        &clause.u)) {
       genPrivatizations<mlir::acc::FirstprivateRecipeOp>(
           firstprivateClause->v, converter, semanticsContext, stmtCtx,
-          firstprivateOperands, firstPrivatizations);
+          firstprivateOperands, firstPrivatizations, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
     } else if (const auto *reductionClause =
                    std::get_if<Fortran::parser::AccClause::Reduction>(
                        &clause.u)) {
@@ -2324,14 +2384,16 @@ static Op createComputeOp(
       // instead.
       if (!combinedConstructs) {
         genReductions(reductionClause->v, converter, semanticsContext, stmtCtx,
-                      reductionOperands, reductionRecipes);
+                      reductionOperands, reductionRecipes, async,
+                      asyncDeviceTypes, asyncOnlyDeviceTypes);
       } else {
         auto crtDataStart = dataClauseOperands.size();
         genDataOperandOperations<mlir::acc::CopyinOp>(
             std::get<Fortran::parser::AccObjectList>(reductionClause->v.t),
             converter, semanticsContext, stmtCtx, dataClauseOperands,
             mlir::acc::DataClause::acc_reduction,
-            /*structured=*/true, /*implicit=*/true);
+            /*structured=*/true, /*implicit=*/true, async, asyncDeviceTypes,
+            asyncOnlyDeviceTypes);
         copyEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
                                  dataClauseOperands.end());
       }
@@ -2343,11 +2405,6 @@ static Op createComputeOp(
       else if ((defaultClause->v).v ==
                llvm::acc::DefaultValue::ACC_Default_present)
         hasDefaultPresent = true;
-    } else if (const auto *deviceTypeClause =
-                   std::get_if<Fortran::parser::AccClause::DeviceType>(
-                       &clause.u)) {
-      crtDeviceTypes.clear();
-      gatherDeviceTypeAttrs(builder, deviceTypeClause, crtDeviceTypes);
     }
   }
 
@@ -2480,6 +2537,28 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter,
   // Lower clauses values mapped to operands and array attributes.
   // Keep track of each group of operands separately as clauses can appear
   // more than once.
+
+  // Process the clauses that may have a specified device_type first.
+  for (const Fortran::parser::AccClause &clause : accClauseList.v) {
+    if (const auto *asyncClause =
+            std::get_if<Fortran::parser::AccClause::Async>(&clause.u)) {
+      genAsyncClause(converter, asyncClause, async, asyncDeviceTypes,
+                     asyncOnlyDeviceTypes, crtDeviceTypes, stmtCtx);
+    } else if (const auto *waitClause =
+                   std::get_if<Fortran::parser::AccClause::Wait>(&clause.u)) {
+      genWaitClauseWithDeviceType(converter, waitClause, waitOperands,
+                                  waitOperandsDeviceTypes, waitOnlyDeviceTypes,
+                                  hasWaitDevnums, waitOperandsSegments,
+                                  crtDeviceTypes, stmtCtx);
+    } else if (const auto *deviceTypeClause =
+                   std::get_if<Fortran::parser::AccClause::DeviceType>(
+                       &clause.u)) {
+      crtDeviceTypes.clear();
+      gatherDeviceTypeAttrs(builder, deviceTypeClause, crtDeviceTypes);
+    }
+  }
+
+  // Process the clauses independent of device_type.
   for (const Fortran::parser::AccClause &clause : accClauseList.v) {
     mlir::Location clauseLocation = converter.genLocation(clause.source);
     if (const auto *ifClause =
@@ -2491,7 +2570,8 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter,
       genDataOperandOperations<mlir::acc::CopyinOp>(
           copyClause->v, converter, semanticsContext, stmtCtx,
           dataClauseOperands, mlir::acc::DataClause::acc_copy,
-          /*structured=*/true, /*implicit=*/false);
+          /*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
       copyEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
                                dataClauseOperands.end());
     } else if (const auto *copyinClause =
@@ -2501,7 +2581,8 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter,
           copyinClause, converter, semanticsContext, stmtCtx,
           Fortran::parser::AccDataModifier::Modifier::ReadOnly,
           dataClauseOperands, mlir::acc::DataClause::acc_copyin,
-          mlir::acc::DataClause::acc_copyin_readonly);
+          mlir::acc::DataClause::acc_copyin_readonly, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
     } else if (const auto *copyoutClause =
                    std::get_if<Fortran::parser::AccClause::Copyout>(
                        &clause.u)) {
@@ -2511,7 +2592,8 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter,
           copyoutClause, converter, semanticsContext, stmtCtx,
           Fortran::parser::AccDataModifier::Modifier::Zero, dataClauseOperands,
           mlir::acc::DataClause::acc_copyout,
-          mlir::acc::DataClause::acc_copyout_zero);
+          mlir::acc::DataClause::acc_copyout_zero, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
       copyoutEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
                                   dataClauseOperands.end());
     } else if (const auto *createClause =
@@ -2522,7 +2604,8 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter,
           createClause, converter, semanticsContext, stmtCtx,
           Fortran::parser::AccDataModifier::Modifier::Zero, dataClauseOperands,
           mlir::acc::DataClause::acc_create,
-          mlir::acc::DataClause::acc_create_zero);
+          mlir::acc::DataClause::acc_create_zero, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
       createEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
                                  dataClauseOperands.end());
     } else if (const auto *noCreateClause =
@@ -2531,51 +2614,40 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter,
       genDataOperandOperations<mlir::acc::NoCreateOp>(
           noCreateClause->v, converter, semanticsContext, stmtCtx,
           dataClauseOperands, mlir::acc::DataClause::acc_no_create,
-          /*structured=*/true, /*implicit=*/false);
+          /*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
     } else if (const auto *presentClause =
                    std::get_if<Fortran::parser::AccClause::Present>(
                        &clause.u)) {
       genDataOperandOperations<mlir::acc::PresentOp>(
           presentClause->v, converter, semanticsContext, stmtCtx,
           dataClauseOperands, mlir::acc::DataClause::acc_present,
-          /*structured=*/true, /*implicit=*/false);
+          /*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
     } else if (const auto *deviceptrClause =
                    std::get_if<Fortran::parser::AccClause::Deviceptr>(
                        &clause.u)) {
       genDataOperandOperations<mlir::acc::DevicePtrOp>(
           deviceptrClause->v, converter, semanticsContext, stmtCtx,
           dataClauseOperands, mlir::acc::DataClause::acc_deviceptr,
-          /*structured=*/true, /*implicit=*/false);
+          /*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
     } else if (const auto *attachClause =
                    std::get_if<Fortran::parser::AccClause::Attach>(&clause.u)) {
       auto crtDataStart = dataClauseOperands.size();
       genDataOperandOperations<mlir::acc::AttachOp>(
           attachClause->v, converter, semanticsContext, stmtCtx,
           dataClauseOperands, mlir::acc::DataClause::acc_attach,
-          /*structured=*/true, /*implicit=*/false);
+          /*structured=*/true, /*implicit=*/false, async, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
       attachEntryOperands.append(dataClauseOperands.begin() + crtDataStart,
                                  dataClauseOperands.end());
-    } else if (const auto *asyncClause =
-                   std::get_if<Fortran::parser::AccClause::Async>(&clause.u)) {
-      genAsyncClause(converter, asyncClause, async, asyncDeviceTypes,
-                     asyncOnlyDeviceTypes, crtDeviceTypes, stmtCtx);
-    } else if (const auto *waitClause =
-                   std::get_if<Fortran::parser::AccClause::Wait>(&clause.u)) {
-      genWaitClauseWithDeviceType(converter, waitClause, waitOperands,
-                                  waitOperandsDeviceTypes, waitOnlyDeviceTypes,
-                                  hasWaitDevnums, waitOperandsSegments,
-                                  crtDeviceTypes, stmtCtx);
     } else if(const auto *defaultClause = 
                   std::get_if<Fortran::parser::AccClause::Default>(&clause.u)) {
       if ((defaultClause->v).v == llvm::acc::DefaultValue::ACC_Default_none)
         hasDefaultNone = true;
       else if ((defaultClause->v).v == llvm::acc::DefaultValue::ACC_Default_present)
         hasDefaultPresent = true;
-    } else if (const auto *deviceTypeClause =
-                   std::get_if<Fortran::parser::AccClause::DeviceType>(
-                       &clause.u)) {
-      crtDeviceTypes.clear();
-      gatherDeviceTypeAttrs(builder, deviceTypeClause, crtDeviceTypes);
     }
   }
 
@@ -2655,7 +2727,8 @@ genACCHostDataOp(Fortran::lower::AbstractConverter &converter,
       genDataOperandOperations<mlir::acc::UseDeviceOp>(
           useDevice->v, converter, semanticsContext, stmtCtx, dataOperands,
           mlir::acc::DataClause::acc_use_device,
-          /*structured=*/true, /*implicit=*/false);
+          /*structured=*/true, /*implicit=*/false, /*async=*/{},
+          /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
     } else if (std::get_if<Fortran::parser::AccClause::IfPresent>(&clause.u)) {
       addIfPresentAttr = true;
     }
@@ -2792,14 +2865,34 @@ genACCEnterDataOp(Fortran::lower::AbstractConverter &converter,
   // Lower clauses values mapped to operands.
   // Keep track of each group of operands separately as clauses can appear
   // more than once.
+
+  // Process the async clause first.
+  for (const Fortran::parser::AccClause &clause : accClauseList.v) {
+    if (const auto *asyncClause =
+            std::get_if<Fortran::parser::AccClause::Async>(&clause.u)) {
+      genAsyncClause(converter, asyncClause, async, addAsyncAttr, stmtCtx);
+    }
+  }
+
+  // The async clause of 'enter data' applies to all device types,
+  // so propagate the async clause to copyin/create/attach ops
+  // as if it is an async clause without preceding device_type clause.
+  llvm::SmallVector<mlir::Attribute> asyncDeviceTypes, asyncOnlyDeviceTypes;
+  llvm::SmallVector<mlir::Value> asyncValues;
+  auto noneDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get(
+      firOpBuilder.getContext(), mlir::acc::DeviceType::None);
+  if (addAsyncAttr) {
+    asyncOnlyDeviceTypes.push_back(noneDeviceTypeAttr);
+  } else if (async) {
+    asyncValues.push_back(async);
+    asyncDeviceTypes.push_back(noneDeviceTypeAttr);
+  }
+
   for (const Fortran::parser::AccClause &clause : accClauseList.v) {
     mlir::Location clauseLocation = converter.genLocation(clause.source);
     if (const auto *ifClause =
             std::get_if<Fortran::parser::AccClause::If>(&clause.u)) {
       genIfClause(converter, clauseLocation, ifClause, ifCond, stmtCtx);
-    } else if (const auto *asyncClause =
-                   std::get_if<Fortran::parser::AccClause::Async>(&clause.u)) {
-      genAsyncClause(converter, asyncClause, async, addAsyncAttr, stmtCtx);
     } else if (const auto *waitClause =
                    std::get_if<Fortran::parser::AccClause::Wait>(&clause.u)) {
       genWaitClause(converter, waitClause, waitOperands, waitDevnum,
@@ -2813,7 +2906,8 @@ genACCEnterDataOp(Fortran::lower::AbstractConverter &converter,
       genDataOperandOperations<mlir::acc::CopyinOp>(
           accObjectList, converter, semanticsContext, stmtCtx,
           dataClauseOperands, mlir::acc::DataClause::acc_copyin, false,
-          /*implicit=*/false);
+          /*implicit=*/false, asyncValues, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
     } else if (const auto *createClause =
                    std::get_if<Fortran::parser::AccClause::Create>(&clause.u)) {
       const Fortran::parser::AccObjectListWithModifier &listWithModifier =
@@ -2829,14 +2923,16 @@ genACCEnterDataOp(Fortran::lower::AbstractConverter &converter,
         clause = mlir::acc::DataClause::acc_create_zero;
       genDataOperandOperations<mlir::acc::CreateOp>(
           accObjectList, converter, semanticsContext, stmtCtx,
-          dataClauseOperands, clause, false, /*implicit=*/false);
+          dataClauseOperands, clause, false, /*implicit=*/false, asyncValues,
+          asyncDeviceTypes, asyncOnlyDeviceTypes);
     } else if (const auto *attachClause =
                    std::get_if<Fortran::parser::AccClause::Attach>(&clause.u)) {
       genDataOperandOperations<mlir::acc::AttachOp>(
           attachClause->v, converter, semanticsContext, stmtCtx,
           dataClauseOperands, mlir::acc::DataClause::acc_attach, false,
-          /*implicit=*/false);
-    } else {
+          /*implicit=*/false, asyncValues, asyncDeviceTypes,
+          asyncOnlyDeviceTypes);
+    } else if (!std::get_if<Fortran::parser::AccClause::Async>(&clause.u)) {
       llvm::report_fatal_error(
           "Unknown clause in ENTER DATA directive lowering");
     }
@@ -2882,14 +2978,34 @@ genACCExitDataOp(Fortran::lower::AbstractConverter &converter,
   // Lower clauses values mapped to operands.
   // Keep track of each group of operands separately as clauses can appear
   // more than once.
+
+  // Process the async clause first.
+  for (const Fortran::parser::AccClause &clause : accClauseList.v) {
+    if (const auto *asyncClause =
+            std::get_if<Fortran::parser::AccClause::Async>(&clause.u)) {
+      genAsyncClause(converter, asyncClause, async, addAsyncAttr, stmtCtx);
+    }
+  }
+
+  // The async clause of 'exit data' applies to all device types,
+  // so propagate the async clause to copyout/delete/detach ops
+  // as if it is an async clause without preceding device_type clause.
+  llvm::SmallVector<mlir::Attribute> asyncDeviceTypes, asyncOnlyDeviceTypes;
+  llvm::SmallVector<mlir::Value> asyncValues;
+  auto noneDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get(
+      builder.getContext(), mlir::acc::DeviceType::None);
+  if (addAsyncAttr) {
+    asyncOnlyDeviceTypes.push_back(noneDeviceTypeAttr);
+  } else if (async) {
+    asyncValues.push_back(async);
+    asyncDeviceTypes.push_back(noneDeviceTypeAttr);
+  }
+
   for (const Fortran::parser::AccClause &clause : accClauseList.v) {
     mlir::Location clauseLocation = converter.genLocation(clause.source);
     if (const auto *ifClause =
             std::get_if<Fortran::parser::AccClause::If>(&clause.u)) {
       genIfClause(converter, clauseLocation, ifClause, ifCond, stmtCtx);
-    } else if (const auto *asyncClause =
-                   std::get_if<Fortran::parser::AccClause::Async>(&clause.u)) {
-      genAsyncClause(converter, asyncClause, async, addAsyncAttr, stmtCtx);
     } else if (const auto *waitClause =
                    std::get_if<Fortran::parser::AccClause::Wait>(&clause.u)) {
       genWaitClause(converter, waitClause, waitOperands, waitDevnum,
@@ -2903,17 +3019,20 @@ genACCExitDataOp(Fortran::lower::AbstractConverter &converter,
           std::get<Fortran::parser::AccObjectList>(listWithModifier.t);
       genDataOperandOperations<mlir::acc::GetDevicePtrOp>(
           accObjectList, converter, semanticsContext, stmtCtx, copyoutOperands,
-          mlir::acc::DataClause::acc_copyout, false, /*implicit=*/false);
+          mlir::acc::DataClause::acc_copyout, false, /*implicit=*/false,
+          asyncValues, asyncDeviceTypes, asyncOnlyDeviceTypes);
     } else if (const auto *deleteClause =
                    std::get_if<Fortran::parser::AccClause::Delete>(&clause.u)) {
       genDataOperandOperations<mlir::acc::GetDevicePtrOp>(
           deleteClause->v, converter, semanticsContext, stmtCtx, deleteOperands,
-          mlir::acc::DataClause::acc_delete, false, /*implicit=*/false);
+          mlir::acc::DataClause::acc_delete, false, /*implicit=*/false,
+          asyncValues, asyncDeviceTypes, asyncOnlyDeviceTypes);
     } else if (const auto *detachClause =
                    std::get_if<Fortran::parser::AccClause::Detach>(&clause.u)) {
       genDataOperandOperations<mlir::acc::GetDevicePtrOp>(
           detachClause->v, converter, semanticsContext, stmtCtx, detachOperands,
-          mlir::acc::DataClause::acc_detach, false, /*implicit=*/false);
+          mlir::acc::DataClause::acc_detach, false, /*implicit=*/false,
+          asyncValues, asyncDeviceTypes, asyncOnlyDeviceTypes);
     } else if (std::get_if<Fortran::parser::AccClause::Finalize>(&clause.u)) {
       addFinalizeAttr = true;
     }
@@ -3089,13 +3208,11 @@ genACCUpdateOp(Fortran::lower::AbstractConverter &converter,
   // Lower clauses values mapped to operands and array attributes.
   // Keep track of each group of operands separately as clauses can appear
   // more than once.
+
+  // Process the clauses that may have a specified device_type first.
   for (const Fortran::parser::AccClause &clause : accClauseList.v) {
-    mlir::Location clauseLocation = converter.genLocation(clause.source);
-    if (const auto *ifClause =
-            std::get_if<Fortran::parser::AccClause::If>(&clause.u)) {
-      genIfClause(converter, clauseLocation, ifClause, ifCond, stmtCtx);
-    } else if (const auto *asyncClause =
-                   std::get_if<Fortran::parser::AccClause::Async>(&clause.u)) {
+    if (const auto *asyncClause =
+            std::get_if<Fortran::parser::AccClause::Async>(&clause.u)) {
       genAsyncClause(converter, asyncClause, asyncOperands,
                      asyncOperandsDeviceTypes, asyncOnlyDeviceTypes,
                      crtDeviceTypes, stmtCtx);
@@ -3110,18 +3227,29 @@ genACCUpdateOp(Fortran::lower::AbstractConverter &converter,
                        &clause.u)) {
       crtDeviceTypes.clear();
       gatherDeviceTypeAttrs(builder, deviceTypeClause, crtDeviceTypes);
+    }
+  }
+
+  // Process the clauses independent of device_type.
+  for (const Fortran::parser::AccClause &clause : accClauseList.v) {
+    mlir::Location clauseLocation = converter.genLocation(clause.source);
+    if (const auto *ifClause =
+            std::get_if<Fortran::parser::AccClause::If>(&clause.u)) {
+      genIfClause(converter, clauseLocation, ifClause, ifCond, stmtCtx);
     } else if (const auto *hostClause =
                    std::get_if<Fortran::parser::AccClause::Host>(&clause.u)) {
       genDataOperandOperations<mlir::acc::GetDevicePtrOp>(
           hostClause->v, converter, semanticsContext, stmtCtx,
           updateHostOperands, mlir::acc::DataClause::acc_update_host, false,
-          /*implicit=*/false);
+          /*implicit=*/false, asyncOperands, asyncOperandsDeviceTypes,
+          asyncOnlyDeviceTypes);
     } else if (const auto *deviceClause =
                    std::get_if<Fortran::parser::AccClause::Device>(&clause.u)) {
       genDataOperandOperations<mlir::acc::UpdateDeviceOp>(
           deviceClause->v, converter, semanticsContext, stmtCtx,
           dataClauseOperands, mlir::acc::DataClause::acc_update_device, false,
-          /*implicit=*/false);
+          /*implicit=*/false, asyncOperands, asyncOperandsDeviceTypes,
+          asyncOnlyDeviceTypes);
     } else if (std::get_if<Fortran::parser::AccClause::IfPresent>(&clause.u)) {
       ifPresent = true;
     } else if (const auto *selfClause =
@@ -3134,7 +3262,8 @@ genACCUpdateOp(Fortran::lower::AbstractConverter &converter,
       genDataOperandOperations<mlir::acc::GetDevicePtrOp>(
           *accObjectList, converter, semanticsContext, stmtCtx,
           updateHostOperands, mlir::acc::DataClause::acc_update_self, false,
-          /*implicit=*/false);
+          /*implicit=*/false, asyncOperands, asyncOperandsDeviceTypes,
+          asyncOnlyDeviceTypes);
     }
   }
 
@@ -3275,7 +3404,8 @@ static void createDeclareGlobalOp(mlir::OpBuilder &modBuilder,
   llvm::SmallVector<mlir::Value> bounds;
   EntryOp entryOp = createDataEntryOp<EntryOp>(
       builder, loc, addrOp.getResTy(), asFortran, bounds,
-      /*structured=*/false, implicit, clause, addrOp.getResTy().getType());
+      /*structured=*/false, implicit, clause, addrOp.getResTy().getType(),
+      /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
   if constexpr (std::is_same_v<DeclareOp, mlir::acc::DeclareEnterOp>)
     builder.create<DeclareOp>(
         loc, mlir::acc::DeclareTokenType::get(entryOp.getContext()),
@@ -3285,7 +3415,9 @@ static void createDeclareGlobalOp(mlir::OpBuilder &modBuilder,
                               mlir::ValueRange(entryOp.getAccPtr()));
   if constexpr (std::is_same_v<GlobalOp, mlir::acc::GlobalDestructorOp>) {
     builder.create<ExitOp>(entryOp.getLoc(), entryOp.getAccPtr(),
-                           entryOp.getBounds(), entryOp.getDataClause(),
+                           entryOp.getBounds(), entryOp.getAsyncOperands(),
+                           entryOp.getAsyncOperandsDeviceTypeAttr(),
+                           entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(),
                            /*structured=*/false, /*implicit=*/false,
                            builder.getStringAttr(*entryOp.getName()));
   }
@@ -3319,7 +3451,8 @@ static void createDeclareAllocFunc(mlir::OpBuilder &modBuilder,
       createDataEntryOp<mlir::acc::UpdateDeviceOp>(
           builder, loc, addrOp, asFortranDesc, bounds,
           /*structured=*/false, /*implicit=*/true,
-          mlir::acc::DataClause::acc_update_device, addrOp.getType());
+          mlir::acc::DataClause::acc_update_device, addrOp.getType(),
+          /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
   llvm::SmallVector<int32_t> operandSegments{0, 0, 0, 1};
   llvm::SmallVector<mlir::Value> operands{updateDeviceOp.getResult()};
   createSimpleOp<mlir::acc::UpdateOp>(builder, loc, operands, operandSegments);
@@ -3329,7 +3462,8 @@ static void createDeclareAllocFunc(mlir::OpBuilder &modBuilder,
   addDeclareAttr(builder, boxAddrOp.getOperation(), clause);
   EntryOp entryOp = createDataEntryOp<EntryOp>(
       builder, loc, boxAddrOp.getResult(), asFortran, bounds,
-      /*structured=*/false, /*implicit=*/false, clause, boxAddrOp.getType());
+      /*structured=*/false, /*implicit=*/false, clause, boxAddrOp.getType(),
+      /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
   builder.create<mlir::acc::DeclareEnterOp>(
       loc, mlir::acc::DeclareTokenType::get(entryOp.getContext()),
       mlir::ValueRange(entryOp.getAccPtr()));
@@ -3366,8 +3500,8 @@ static void createDeclareDeallocFunc(mlir::OpBuilder &modBuilder,
   mlir::acc::GetDevicePtrOp entryOp =
       createDataEntryOp<mlir::acc::GetDevicePtrOp>(
           builder, loc, boxAddrOp.getResult(), asFortran, bounds,
-          /*structured=*/false, /*implicit=*/false, clause,
-          boxAddrOp.getType());
+          /*structured=*/false, /*implicit=*/false, clause, boxAddrOp.getType(),
+          /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
 
   builder.create<mlir::acc::DeclareExitOp>(
       loc, mlir::Value{}, mlir::ValueRange(entryOp.getAccPtr()));
@@ -3376,12 +3510,16 @@ static void createDeclareDeallocFunc(mlir::OpBuilder &modBuilder,
                 std::is_same_v<ExitOp, mlir::acc::UpdateHostOp>)
     builder.create<ExitOp>(entryOp.getLoc(), entryOp.getAccPtr(),
                            entryOp.getVarPtr(), entryOp.getBounds(),
-                           entryOp.getDataClause(),
+                           entryOp.getAsyncOperands(),
+                           entryOp.getAsyncOperandsDeviceTypeAttr(),
+                           entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(),
                            /*structured=*/false, /*implicit=*/false,
                            builder.getStringAttr(*entryOp.getName()));
   else
     builder.create<ExitOp>(entryOp.getLoc(), entryOp.getAccPtr(),
-                           entryOp.getBounds(), entryOp.getDataClause(),
+                           entryOp.getBounds(), entryOp.getAsyncOperands(),
+                           entryOp.getAsyncOperandsDeviceTypeAttr(),
+                           entryOp.getAsyncOnlyAttr(), entryOp.getDataClause(),
                            /*structured=*/false, /*implicit=*/false,
                            builder.getStringAttr(*entryOp.getName()));
 
@@ -3400,7 +3538,8 @@ static void createDeclareDeallocFunc(mlir::OpBuilder &modBuilder,
       createDataEntryOp<mlir::acc::UpdateDeviceOp>(
           builder, loc, addrOp, asFortran, bounds,
           /*structured=*/false, /*implicit=*/true,
-          mlir::acc::DataClause::acc_update_device, addrOp.getType());
+          mlir::acc::DataClause::acc_update_device, addrOp.getType(),
+          /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{});
   llvm::SmallVector<int32_t> operandSegments{0, 0, 0, 1};
   llvm::SmallVector<mlir::Value> operands{updateDeviceOp.getResult()};
   createSimpleOp<mlir::acc::UpdateOp>(builder, loc, operands, operandSegments);
@@ -4046,7 +4185,9 @@ genACC(Fortran::lower::AbstractConverter &converter,
     genDataOperandOperations<mlir::acc::CacheOp>(
         accObjectList, converter, semanticsContext, stmtCtx, cacheOperands,
         dataClause,
-        /*structured=*/true, /*implicit=*/false, /*setDeclareAttr*/ false);
+        /*structured=*/true, /*implicit=*/false,
+        /*async=*/{}, /*asyncDeviceTypes=*/{}, /*asyncOnlyDeviceTypes=*/{},
+        /*setDeclareAttr*/ false);
     loopOp.getCacheOperandsMutable().append(cacheOperands);
   } else {
     llvm::report_fatal_error(
diff --git a/flang/test/Lower/OpenACC/acc-data.f90 b/flang/test/Lower/OpenACC/acc-data.f90
index f120be272991a..6e0ecb9129061 100644
--- a/flang/test/Lower/OpenACC/acc-data.f90
+++ b/flang/test/Lower/OpenACC/acc-data.f90
@@ -155,11 +155,13 @@ subroutine acc_data
 ! CHECK: acc.data dataOperands(%{{.*}}) {
 ! CHECK: } attributes {asyncOnly = [#acc.device_type<none>]}
 
-  !$acc data present(a) async(1)
+  !$acc data copy(a) async(1)
   !$acc end data
 
-! CHECK: acc.data async(%{{.*}} : i32) dataOperands(%{{.*}}) {
+! CHECK: %[[COPYIN:.*]] = acc.copyin varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) async(%[[ASYNC:.*]] : i32) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_copy>, name = "a"}
+! CHECK: acc.data async(%[[ASYNC]] : i32) dataOperands(%[[COPYIN]] : !fir.ref<!fir.array<10x10xf32>>) {
 ! CHECK: }{{$}}
+! CHECK: acc.copyout accPtr(%[[COPYIN]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) async(%[[ASYNC]] : i32) to varPtr(%{{.*}} : !fir.ref<!fir.array<10x10xf32>>) {dataClause = #acc<data_clause acc_copy>, name = "a"}
 
   !$acc data present(a) wait
   !$acc end data
diff --git a/flang/test/Lower/OpenACC/acc-enter-data.f90 b/flang/test/Lower/OpenACC/acc-enter-data.f90
index 251edbf9c2dd0..80326a1012376 100644
--- a/flang/test/Lower/OpenACC/acc-enter-data.f90
+++ b/flang/test/Lower/OpenACC/acc-enter-data.f90
@@ -94,7 +94,7 @@ subroutine acc_enter_data
   !$acc enter data create(a) async
 !CHECK: %[[BOUND0:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[C10]] : index) stride(%c1{{.*}} : index) startIdx(%{{.*}} : index)
 !CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[EXTENT_C10]] : index) stride(%c1{{.*}} : index) startIdx(%{{.*}} : index)
-!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND0]], %[[BOUND1]]) -> !fir.ref<!fir.array<10x10xf32>> {name = "a", structured = false}
+!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND0]], %[[BOUND1]]) -> !fir.ref<!fir.array<10x10xf32>> {asyncOnly = [#acc.device_type<none>], name = "a", structured = false}
 !CHECK: acc.enter_data dataOperands(%[[CREATE_A]] : !fir.ref<!fir.array<10x10xf32>>) attributes {async}
 
   !$acc enter data create(a) wait
@@ -106,22 +106,22 @@ subroutine acc_enter_data
   !$acc enter data create(a) async wait
 !CHECK: %[[BOUND0:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[C10]] : index) stride(%c1{{.*}} : index) startIdx(%{{.*}} : index)
 !CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[EXTENT_C10]] : index) stride(%c1{{.*}} : index) startIdx(%{{.*}} : index)
-!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND0]], %[[BOUND1]]) -> !fir.ref<!fir.array<10x10xf32>> {name = "a", structured = false}
+!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND0]], %[[BOUND1]]) -> !fir.ref<!fir.array<10x10xf32>> {asyncOnly = [#acc.device_type<none>], name = "a", structured = false}
 !CHECK: acc.enter_data dataOperands(%[[CREATE_A]] : !fir.ref<!fir.array<10x10xf32>>) attributes {async, wait}
 
   !$acc enter data create(a) async(1)
+!CHECK: %[[ASYNC1:.*]] = arith.constant 1 : i32
 !CHECK: %[[BOUND0:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[C10]] : index) stride(%c1{{.*}} : index) startIdx(%{{.*}} : index)
 !CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[EXTENT_C10]] : index) stride(%c1{{.*}} : index) startIdx(%{{.*}} : index)
-!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND0]], %[[BOUND1]]) -> !fir.ref<!fir.array<10x10xf32>> {name = "a", structured = false}
-!CHECK: %[[ASYNC1:.*]] = arith.constant 1 : i32
+!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND0]], %[[BOUND1]]) async(%[[ASYNC1]] : i32) -> !fir.ref<!fir.array<10x10xf32>> {name = "a", structured = false}
 !CHECK: acc.enter_data async(%[[ASYNC1]] : i32) dataOperands(%[[CREATE_A]] : !fir.ref<!fir.array<10x10xf32>>)
 
   !$acc enter data create(a) async(async)
+!CHECK: %[[ASYNC2:.*]] = fir.load %{{.*}} : !fir.ref<i32>
 !CHECK: %[[BOUND0:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[C10]] : index) stride(%c1{{.*}} : index) startIdx(%{{.*}} : index)
 !CHECK: %[[BOUND1:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) extent(%[[EXTENT_C10]] : index) stride(%c1{{.*}} : index) startIdx(%{{.*}} : index)
 
-!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND0]], %[[BOUND1]]) -> !fir.ref<!fir.array<10x10xf32>> {name = "a", structured = false}
-!CHECK: %[[ASYNC2:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+!CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%[[BOUND0]], %[[BOUND1]]) async(%[[ASYNC2]] : i32) -> !fir.ref<!fir.array<10x10xf32>> {name = "a", structured = false}
 !CHECK: acc.enter_data async(%[[ASYNC2]] : i32) dataOperands(%[[CREATE_A]] : !fir.ref<!fir.array<10x10xf32>>)
 
   !$acc enter data create(a) wait(1)
diff --git a/flang/test/Lower/OpenACC/acc-exit-data.f90 b/flang/test/Lower/OpenACC/acc-exit-data.f90
index 6600f08d5bcfe..017f1f38f8397 100644
--- a/flang/test/Lower/OpenACC/acc-exit-data.f90
+++ b/flang/test/Lower/OpenACC/acc-exit-data.f90
@@ -56,9 +56,9 @@ subroutine acc_exit_data
 !CHECK: acc.detach accPtr(%[[DEVPTR_D]] : !fir.ptr<f32>) {name = "d", structured = false}
 
   !$acc exit data delete(a) async
-!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_delete>, name = "a", structured = false}
+!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {asyncOnly = [#acc.device_type<none>], dataClause = #acc<data_clause acc_delete>, name = "a", structured = false}
 !CHECK: acc.exit_data dataOperands(%[[DEVPTR]] : !fir.ref<!fir.array<10x10xf32>>) attributes {async}
-!CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) {name = "a", structured = false}
+!CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) {asyncOnly = [#acc.device_type<none>], name = "a", structured = false}
 
   !$acc exit data delete(a) wait
 !CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_delete>, name = "a", structured = false}
@@ -66,22 +66,22 @@ subroutine acc_exit_data
 !CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) {name = "a", structured = false}
 
   !$acc exit data delete(a) async wait
-!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_delete>, name = "a", structured = false}
+!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {asyncOnly = [#acc.device_type<none>], dataClause = #acc<data_clause acc_delete>, name = "a", structured = false}
 !CHECK: acc.exit_data dataOperands(%[[DEVPTR]] : !fir.ref<!fir.array<10x10xf32>>) attributes {async, wait}
-!CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) {name = "a", structured = false}
+!CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) {asyncOnly = [#acc.device_type<none>], name = "a", structured = false}
 
   !$acc exit data delete(a) async(1)
-!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_delete>, name = "a", structured = false}
 !CHECK: %[[ASYNC1:.*]] = arith.constant 1 : i32
+!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) async(%[[ASYNC1]] : i32) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_delete>, name = "a", structured = false}
 !CHECK: acc.exit_data async(%[[ASYNC1]] : i32) dataOperands(%[[DEVPTR]] : !fir.ref<!fir.array<10x10xf32>>)
-!CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) {name = "a", structured = false}
+!CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) async(%[[ASYNC1]] : i32) {name = "a", structured = false}
 
 
   !$acc exit data delete(a) async(async)
-!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_delete>, name = "a", structured = false}
 !CHECK: %[[ASYNC2:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+!CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) async(%[[ASYNC2]] : i32) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_delete>, name = "a", structured = false}
 !CHECK: acc.exit_data async(%[[ASYNC2]] : i32) dataOperands(%[[DEVPTR]] : !fir.ref<!fir.array<10x10xf32>>)
-!CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) {name = "a", structured = false}
+!CHECK: acc.delete accPtr(%[[DEVPTR]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) async(%[[ASYNC2]] : i32) {name = "a", structured = false}
 
   !$acc exit data delete(a) wait(1)
 !CHECK: %[[DEVPTR:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_delete>, name = "a", structured = false}
diff --git a/flang/test/Lower/OpenACC/acc-parallel.f90 b/flang/test/Lower/OpenACC/acc-parallel.f90
index 4b18a8d037f22..5197e2b0bee09 100644
--- a/flang/test/Lower/OpenACC/acc-parallel.f90
+++ b/flang/test/Lower/OpenACC/acc-parallel.f90
@@ -60,7 +60,7 @@ subroutine acc_parallel
   !$acc parallel async
   !$acc end parallel
 
-! CHECK:      acc.parallel {
+! CHECK: acc.parallel {
 ! CHECK:        acc.yield
 ! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
 
@@ -76,7 +76,7 @@ subroutine acc_parallel
   !$acc end parallel
 
 ! CHECK:      [[ASYNC2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK:      acc.parallel async([[ASYNC2]] : i32) {
+! CHECK-NEXT:  acc.parallel async([[ASYNC2]] : i32) {
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -324,13 +324,13 @@ subroutine acc_parallel
 ! CHECK: acc.detach accPtr(%[[ATTACH_D]] : !fir.ptr<f32>) {dataClause = #acc<data_clause acc_attach>, name = "d"}
 ! CHECK: acc.detach accPtr(%[[ATTACH_E]] : !fir.ptr<f32>) {dataClause = #acc<data_clause acc_attach>, name = "e"}
 
-!$acc parallel private(a) firstprivate(b) private(c)
+!$acc parallel private(a) firstprivate(b) private(c) async(1)
 !$acc end parallel
 
-! CHECK:      %[[ACC_PRIVATE_A:.*]] = acc.private varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {name = "a"}
-! CHECK:      %[[ACC_FPRIVATE_B:.*]] = acc.firstprivate varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {name = "b"}
-! CHECK:      %[[ACC_PRIVATE_C:.*]] = acc.private varPtr(%[[DECLC]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {name = "c"}
-! CHECK:      acc.parallel firstprivate(@firstprivatization_section_ext10xext10_ref_10x10xf32 -> %[[ACC_FPRIVATE_B]] : !fir.ref<!fir.array<10x10xf32>>) private(@privatization_ref_10x10xf32 -> %[[ACC_PRIVATE_A]] : !fir.ref<!fir.array<10x10xf32>>, @privatization_ref_10x10xf32 -> %[[ACC_PRIVATE_C]] : !fir.ref<!fir.array<10x10xf32>>) {
+! CHECK:      %[[ACC_PRIVATE_A:.*]] = acc.private varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) async([[ASYNC3:%.*]]) -> !fir.ref<!fir.array<10x10xf32>> {name = "a"}
+! CHECK:      %[[ACC_FPRIVATE_B:.*]] = acc.firstprivate varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) async([[ASYNC3]]) -> !fir.ref<!fir.array<10x10xf32>> {name = "b"}
+! CHECK:      %[[ACC_PRIVATE_C:.*]] = acc.private varPtr(%[[DECLC]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) async([[ASYNC3]]) -> !fir.ref<!fir.array<10x10xf32>> {name = "c"}
+! CHECK:      acc.parallel async([[ASYNC3]]) firstprivate(@firstprivatization_section_ext10xext10_ref_10x10xf32 -> %[[ACC_FPRIVATE_B]] : !fir.ref<!fir.array<10x10xf32>>) private(@privatization_ref_10x10xf32 -> %[[ACC_PRIVATE_A]] : !fir.ref<!fir.array<10x10xf32>>, @privatization_ref_10x10xf32 -> %[[ACC_PRIVATE_C]] : !fir.ref<!fir.array<10x10xf32>>) {
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
diff --git a/flang/test/Lower/OpenACC/acc-serial.f90 b/flang/test/Lower/OpenACC/acc-serial.f90
index c6fe6c3d58fe5..284f61976a46d 100644
--- a/flang/test/Lower/OpenACC/acc-serial.f90
+++ b/flang/test/Lower/OpenACC/acc-serial.f90
@@ -60,7 +60,7 @@ subroutine acc_serial
   !$acc serial async
   !$acc end serial
 
-! CHECK:      acc.serial {
+! CHECK: acc.serial {
 ! CHECK:        acc.yield
 ! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} 
 
@@ -76,7 +76,7 @@ subroutine acc_serial
   !$acc end serial
 
 ! CHECK:      [[ASYNC2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK:      acc.serial async([[ASYNC2]] : i32) {
+! CHECK-NEXT:      acc.serial async([[ASYNC2]] : i32) {
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
diff --git a/flang/test/Lower/OpenACC/acc-update.f90 b/flang/test/Lower/OpenACC/acc-update.f90
index bab21f82152b2..0964fd91457f9 100644
--- a/flang/test/Lower/OpenACC/acc-update.f90
+++ b/flang/test/Lower/OpenACC/acc-update.f90
@@ -60,9 +60,9 @@ subroutine acc_update
 ! CHECK: acc.update_host accPtr(%[[DEVPTR_B]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) to varPtr(%[[DECLB]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "b", structured = false}
 
   !$acc update host(a) async
-! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
+! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {asyncOnly = [#acc.device_type<none>], dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
 ! CHECK: acc.update async dataOperands(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>)
-! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a", structured = false}
+! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {asyncOnly = [#acc.device_type<none>], name = "a", structured = false}
 
   !$acc update host(a) wait
 ! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
@@ -70,32 +70,32 @@ subroutine acc_update
 ! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a", structured = false}
 
   !$acc update host(a) async wait
-! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
+! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {asyncOnly = [#acc.device_type<none>], dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
 ! CHECK: acc.update async wait dataOperands(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>)
-! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a", structured = false}
+! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {asyncOnly = [#acc.device_type<none>], name = "a", structured = false}
 
   !$acc update host(a) async(1)
-! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
 ! CHECK: [[ASYNC1:%.*]] = arith.constant 1 : i32
+! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) async([[ASYNC1]] : i32) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
 ! CHECK: acc.update async([[ASYNC1]] : i32) dataOperands(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>)
-! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a", structured = false}
+! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) async([[ASYNC1]] : i32) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a", structured = false}
 
   !$acc update host(a) async(async)
-! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
 ! CHECK: [[ASYNC2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) async([[ASYNC2]] : i32) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
 ! CHECK: acc.update async([[ASYNC2]] : i32) dataOperands(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>)
-! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a", structured = false}
+! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) async([[ASYNC2]] : i32) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a", structured = false}
 
   !$acc update host(a) wait(1)
-! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
 ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32
+! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
 ! CHECK: acc.update wait({[[WAIT1]] : i32}) dataOperands(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>)
 ! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a", structured = false}
 
   !$acc update host(a) wait(queues: 1, 2)
-! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
 ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32
 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32
+! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
 ! CHECK: acc.update wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) dataOperands(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>)
 ! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a", structured = false}
 
@@ -105,8 +105,8 @@ subroutine acc_update
 ! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a", structured = false}
 
   !$acc update host(a) device_type(host, nvidia) async
-! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
+! CHECK: %[[DEVPTR_A:.*]] = acc.getdeviceptr varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<10x10xf32>> {asyncOnly = [#acc.device_type<host>, #acc.device_type<nvidia>], dataClause = #acc<data_clause acc_update_host>, name = "a", structured = false}
 ! CHECK: acc.update async([#acc.device_type<host>, #acc.device_type<nvidia>]) dataOperands(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>)
-! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {name = "a", structured = false}
+! CHECK: acc.update_host accPtr(%[[DEVPTR_A]] : !fir.ref<!fir.array<10x10xf32>>) bounds(%{{.*}}, %{{.*}}) to varPtr(%[[DECLA]]#0 : !fir.ref<!fir.array<10x10xf32>>) {asyncOnly = [#acc.device_type<host>, #acc.device_type<nvidia>], name = "a", structured = false}
 
 end subroutine acc_update
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
index c53a437ac092b..8239367fdd3e7 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
@@ -101,6 +101,21 @@ mlir::Value getVarPtrPtr(mlir::Operation *accDataClauseOp);
 /// Returns an empty vector if there are no bounds.
 mlir::SmallVector<mlir::Value> getBounds(mlir::Operation *accDataClauseOp);
 
+/// Used to obtain `async` operands from an acc data clause operation.
+/// Returns an empty vector if there are no such operands.
+mlir::SmallVector<mlir::Value>
+getAsyncOperands(mlir::Operation *accDataClauseOp);
+
+/// Returns an array of acc::DeviceTypeAttr attributes attached to
+/// an acc data clause operation, that correspond to the device types
+/// associated with the async clauses with an async-value.
+mlir::ArrayAttr getAsyncOperandsDeviceType(mlir::Operation *accDataClauseOp);
+
+/// Returns an array of acc::DeviceTypeAttr attributes attached to
+/// an acc data clause operation, that correspond to the device types
+/// associated with the async clauses without an async-value.
+mlir::ArrayAttr getAsyncOnly(mlir::Operation *accDataClauseOp);
+
 /// Used to obtain the `name` from an acc operation.
 std::optional<llvm::StringRef> getVarName(mlir::Operation *accOp);
 
@@ -147,6 +162,11 @@ struct ConstructResource
   mlir::StringRef getName() final { return "AccConstructResource"; }
 };
 
+struct CurrentDeviceIdResource
+    : public mlir::SideEffects::Resource::Base<CurrentDeviceIdResource> {
+  mlir::StringRef getName() final { return "AccCurrentDeviceIdResource"; }
+};
+
 } // namespace acc
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 361ede110ed13..dc255e772841c 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -236,6 +236,43 @@ def OpenACC_CombinedConstructsAttr : EnumAttr<OpenACC_Dialect,
   let assemblyFormat = [{ ```<` $value `>` }];
 }
 
+def OpenACC_ParallelConstruct : I64EnumAttrCase<"acc_construct_parallel", 0>;
+def OpenACC_KernelsConstruct : I64EnumAttrCase<"acc_construct_kernels", 1>;
+def OpenACC_LoopConstruct : I64EnumAttrCase<"acc_construct_loop", 2>;
+def OpenACC_DataConstruct : I64EnumAttrCase<"acc_construct_data", 3>;
+def OpenACC_EnterDataConstruct : I64EnumAttrCase<"acc_construct_enter_data", 4>;
+def OpenACC_ExitDataConstruct : I64EnumAttrCase<"acc_construct_exit_data", 5>;
+def OpenACC_HostDataConstruct : I64EnumAttrCase<"acc_construct_host_data", 6>;
+def OpenACC_AtomicConstruct : I64EnumAttrCase<"acc_construct_atomic", 7>;
+def OpenACC_DeclareConstruct : I64EnumAttrCase<"acc_construct_declare", 8>;
+def OpenACC_InitConstruct : I64EnumAttrCase<"acc_construct_init", 9>;
+def OpenACC_ShutdownConstruct : I64EnumAttrCase<"acc_construct_shutdown", 10>;
+def OpenACC_SetConstruct : I64EnumAttrCase<"acc_construct_set", 11>;
+def OpenACC_UpdateConstruct : I64EnumAttrCase<"acc_construct_update", 12>;
+def OpenACC_RoutineConstruct : I64EnumAttrCase<"acc_construct_routine", 13>;
+def OpenACC_WaitConstruct : I64EnumAttrCase<"acc_construct_wait", 14>;
+def OpenACC_RuntimeAPIConstruct : I64EnumAttrCase<"acc_construct_runtime_api", 15>;
+def OpenACC_SerialConstruct : I64EnumAttrCase<"acc_construct_serial", 16>;
+
+def OpenACC_ConstructEnum : I64EnumAttr<"Construct",
+    "constructs supported by OpenACC",
+    [OpenACC_ParallelConstruct, OpenACC_KernelsConstruct,
+     OpenACC_LoopConstruct, OpenACC_DataConstruct,
+     OpenACC_EnterDataConstruct, OpenACC_ExitDataConstruct,
+     OpenACC_HostDataConstruct, OpenACC_AtomicConstruct,
+     OpenACC_DeclareConstruct, OpenACC_InitConstruct,
+     OpenACC_ShutdownConstruct, OpenACC_SetConstruct,
+     OpenACC_UpdateConstruct, OpenACC_RoutineConstruct,
+     OpenACC_WaitConstruct, OpenACC_RuntimeAPIConstruct,
+     OpenACC_SerialConstruct
+    ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::acc";
+}
+
+def OpenACC_ConstructAttr : EnumAttr<OpenACC_Dialect, OpenACC_ConstructEnum,
+                                     "construct">;
+
 // Define a resource for the OpenACC runtime counters.
 def OpenACC_RuntimeCounters : Resource<"::mlir::acc::RuntimeCounters">;
 
@@ -245,6 +282,9 @@ def OpenACC_RuntimeCounters : Resource<"::mlir::acc::RuntimeCounters">;
 // `dataOperands` list).
 def OpenACC_ConstructResource : Resource<"::mlir::acc::ConstructResource">;
 
+// Define a resource for the OpenACC current device setting.
+def OpenACC_CurrentDeviceIdResource : Resource<"::mlir::acc::CurrentDeviceIdResource">;
+
 // Used for data specification in data clauses (2.7.1).
 // Either (or both) extent and upperbound must be specified.
 def OpenACC_DataBoundsOp : OpenACC_Op<"bounds",
@@ -320,10 +360,15 @@ def OpenACC_DataBoundsOp : OpenACC_Op<"bounds",
 class OpenACC_DataEntryOp<string mnemonic, string clause, string extraDescription,
                           list<Trait> traits = [], dag additionalArgs = (ins)> :
     OpenACC_Op<mnemonic, !listconcat(traits,
-        [AttrSizedOperandSegments])> {
+        [AttrSizedOperandSegments,
+         MemoryEffects<[MemRead<OpenACC_CurrentDeviceIdResource>]>])> {
   let arguments = !con(additionalArgs,
-                      (ins Optional<OpenACC_PointerLikeTypeInterface>:$varPtrPtr,
+                      (ins
+                       Optional<OpenACC_PointerLikeTypeInterface>:$varPtrPtr,
                        Variadic<OpenACC_DataBoundsType>:$bounds, /* rank-0 to rank-{n-1} */
+                       Variadic<IntOrIndex>:$asyncOperands,
+                       OptionalAttr<DeviceTypeArrayAttr>:$asyncOperandsDeviceType,
+                       OptionalAttr<DeviceTypeArrayAttr>:$asyncOnly,
                        DefaultValuedAttr<OpenACC_DataClauseAttr,clause>:$dataClause,
                        DefaultValuedAttr<BoolAttr, "true">:$structured,
                        DefaultValuedAttr<BoolAttr, "false">:$implicit,
@@ -337,6 +382,10 @@ class OpenACC_DataEntryOp<string mnemonic, string clause, string extraDescriptio
     attach semantics on data clauses (2.6.4).
     - `bounds`: Used when copying just slice of array or array's bounds are not
     encoded in type. They are in rank order where rank 0 is inner-most dimension.
+    - `asyncOperands` and `asyncOperandsDeviceType`:
+    pair-wise lists of the async clause values associated with device_type's.
+    - `asyncOnly`: a list of device_type's for which async clause
+    does not specify a value (default is acc_async_noval - OpenACC 3.3 2.16.1).
     - `dataClause`: Keeps track of the data clause the user used. This is because
     the acc operations are decomposed. So a 'copy' clause is decomposed to both 
     `acc.copyin` and `acc.copyout` operations, but both have dataClause that
@@ -348,13 +397,54 @@ class OpenACC_DataEntryOp<string mnemonic, string clause, string extraDescriptio
     - `implicit`: Whether this is an implicitly generated operation, such as copies
     done to satisfy "Variables with Implicitly Determined Data Attributes" in 2.6.2.
     - `name`: Holds the name of variable as specified in user clause (including bounds).
+
+    The async values attached to the data entry operation imply that the data
+    action applies to all device types specified by the device_type clauses
+    using the activity queues on these devices as defined by the async values.
   }]);
 
+  code extraClassDeclarationBase = [{
+    /// Return true if the op has a value-less async clause for the
+    /// mlir::acc::DeviceType::None device_type.
+    bool hasAsyncOnly() {
+      return hasAsyncOnly(mlir::acc::DeviceType::None);
+    }
+    /// Return true if `asyncOnly` lists the given device_type. The attribute
+    /// is optional, so it must be null-checked before being iterated.
+    bool hasAsyncOnly(mlir::acc::DeviceType deviceType) {
+      mlir::ArrayAttr asyncOnly = getAsyncOnlyAttr();
+      return asyncOnly && llvm::any_of(asyncOnly, [&](mlir::Attribute attr) {
+        return mlir::cast<mlir::acc::DeviceTypeAttr>(attr).getValue() ==
+               deviceType;
+      });
+    }
+    /// Return the value of the async clause if present.
+    mlir::Value getAsyncValue() {
+      return getAsyncValue(mlir::acc::DeviceType::None);
+    }
+    /// Return the value of the async clause for the given device_type if
+    /// present.
+    mlir::Value getAsyncValue(mlir::acc::DeviceType deviceType) {
+      mlir::ArrayAttr deviceTypes = getAsyncOperandsDeviceTypeAttr();
+      if (!deviceTypes)
+        return nullptr;
+      for (auto [attr, asyncValue] :
+          llvm::zip(deviceTypes, getAsyncOperands())) {
+        auto deviceTypeAttr = mlir::cast<mlir::acc::DeviceTypeAttr>(attr);
+        if (deviceTypeAttr.getValue() == deviceType)
+          return asyncValue;
+      }
+      return nullptr;
+    }
+  }];
+
   let assemblyFormat = [{
     `varPtr` `(` $varPtr `:` type($varPtr) `)`
     oilist(
         `varPtrPtr` `(` $varPtrPtr `:` type($varPtrPtr) `)`
       | `bounds` `(` $bounds `)`
+      | `async` `(` custom<DeviceTypeOperands>($asyncOperands,
+            type($asyncOperands), $asyncOperandsDeviceType) `)`
     ) `->` type($accPtr) attr-dict
   }];
 
@@ -370,6 +460,7 @@ def OpenACC_PrivateOp : OpenACC_DataEntryOp<"private",
   let summary = "Represents private semantics for acc private clause.";
   let results = (outs Arg<OpenACC_PointerLikeTypeInterface,
                           "Address of device variable",[MemWrite]>:$accPtr);
+  let extraClassDeclaration = extraClassDeclarationBase;
 }
 
 //===----------------------------------------------------------------------===//
@@ -382,6 +473,7 @@ def OpenACC_FirstprivateOp : OpenACC_DataEntryOp<"firstprivate",
                 "clause.";
   let results = (outs Arg<OpenACC_PointerLikeTypeInterface,
                           "Address of device variable",[MemWrite]>:$accPtr);
+  let extraClassDeclaration = extraClassDeclarationBase;
 }
 
 //===----------------------------------------------------------------------===//
@@ -393,6 +485,7 @@ def OpenACC_ReductionOp : OpenACC_DataEntryOp<"reduction",
   let summary = "Represents reduction semantics for acc reduction clause.";
   let results = (outs Arg<OpenACC_PointerLikeTypeInterface,
                           "Address of device variable",[MemWrite]>:$accPtr);
+  let extraClassDeclaration = extraClassDeclarationBase;
 }
 
 //===----------------------------------------------------------------------===//
@@ -404,6 +497,7 @@ def OpenACC_DevicePtrOp : OpenACC_DataEntryOp<"deviceptr",
     (ins OpenACC_PointerLikeTypeInterface:$varPtr)> {
   let summary = "Specifies that the variable pointer is a device pointer.";
   let results = (outs OpenACC_PointerLikeTypeInterface:$accPtr);
+  let extraClassDeclaration = extraClassDeclarationBase;
 }
 
 //===----------------------------------------------------------------------===//
@@ -416,6 +510,7 @@ def OpenACC_PresentOp : OpenACC_DataEntryOp<"present",
     (ins OpenACC_PointerLikeTypeInterface:$varPtr)> {
   let summary = "Specifies that the variable is already present on device.";
   let results = (outs OpenACC_PointerLikeTypeInterface:$accPtr);
+  let extraClassDeclaration = extraClassDeclarationBase;
 }
 
 //===----------------------------------------------------------------------===//
@@ -431,7 +526,7 @@ def OpenACC_CopyinOp : OpenACC_DataEntryOp<"copyin",
   let results = (outs Arg<OpenACC_PointerLikeTypeInterface,
                           "Address of device variable",[MemWrite]>:$accPtr);
 
-  let extraClassDeclaration = [{
+  let extraClassDeclaration = extraClassDeclarationBase # [{
     /// Check if this is a copyin with readonly modifier.
     bool isCopyinReadonly();
   }];
@@ -450,7 +545,7 @@ def OpenACC_CreateOp : OpenACC_DataEntryOp<"create",
   let results = (outs Arg<OpenACC_PointerLikeTypeInterface,
                           "Address of device variable",[MemWrite]>:$accPtr);
 
-  let extraClassDeclaration = [{
+  let extraClassDeclaration = extraClassDeclarationBase # [{
     /// Check if this is a create with zero modifier.
     bool isCreateZero();
   }];
@@ -466,6 +561,7 @@ def OpenACC_NoCreateOp : OpenACC_DataEntryOp<"nocreate",
     (ins OpenACC_PointerLikeTypeInterface:$varPtr)> {
   let summary = "Represents acc no_create semantics.";
   let results = (outs OpenACC_PointerLikeTypeInterface:$accPtr);
+  let extraClassDeclaration = extraClassDeclarationBase;
 }
 
 //===----------------------------------------------------------------------===//
@@ -480,6 +576,7 @@ def OpenACC_AttachOp : OpenACC_DataEntryOp<"attach",
                 "device memory with the corresponding device address of the "
                 "pointee.";
   let results = (outs OpenACC_PointerLikeTypeInterface:$accPtr);
+  let extraClassDeclaration = extraClassDeclarationBase;
 }
 
 //===----------------------------------------------------------------------===//
@@ -502,6 +599,7 @@ def OpenACC_GetDevicePtrOp : OpenACC_DataEntryOp<"getdeviceptr",
   let summary = "Gets device address if variable exists on device.";
   let results = (outs OpenACC_PointerLikeTypeInterface:$accPtr);
   let hasVerifier = 0;
+  let extraClassDeclaration = extraClassDeclarationBase;
 }
 
 //===----------------------------------------------------------------------===//
@@ -513,6 +611,7 @@ def OpenACC_UpdateDeviceOp : OpenACC_DataEntryOp<"update_device",
   let summary = "Represents acc update device semantics.";
   let results = (outs Arg<OpenACC_PointerLikeTypeInterface,
                           "Address of device variable",[MemWrite]>:$accPtr);
+  let extraClassDeclaration = extraClassDeclarationBase;
 }
 
 //===----------------------------------------------------------------------===//
@@ -524,6 +623,7 @@ def OpenACC_UseDeviceOp : OpenACC_DataEntryOp<"use_device",
     (ins OpenACC_PointerLikeTypeInterface:$varPtr)> {
   let summary = "Represents acc use_device semantics.";
   let results = (outs OpenACC_PointerLikeTypeInterface:$accPtr);
+  let extraClassDeclaration = extraClassDeclarationBase;
 }
 
 //===----------------------------------------------------------------------===//
@@ -535,6 +635,7 @@ def OpenACC_DeclareDeviceResidentOp : OpenACC_DataEntryOp<"declare_device_reside
     (ins Arg<OpenACC_PointerLikeTypeInterface,"Address of variable",[MemRead]>:$varPtr)> {
   let summary = "Represents acc declare device_resident semantics.";
   let results = (outs OpenACC_PointerLikeTypeInterface:$accPtr);
+  let extraClassDeclaration = extraClassDeclarationBase;
 }
 
 //===----------------------------------------------------------------------===//
@@ -546,6 +647,7 @@ def OpenACC_DeclareLinkOp : OpenACC_DataEntryOp<"declare_link",
     (ins Arg<OpenACC_PointerLikeTypeInterface,"Address of variable",[MemRead]>:$varPtr)> {
   let summary = "Represents acc declare link semantics.";
   let results = (outs OpenACC_PointerLikeTypeInterface:$accPtr);
+  let extraClassDeclaration = extraClassDeclarationBase;
 }
 
 //===----------------------------------------------------------------------===//
@@ -558,7 +660,7 @@ def OpenACC_CacheOp : OpenACC_DataEntryOp<"cache",
                 "loop.";
   let results = (outs OpenACC_PointerLikeTypeInterface:$accPtr);
 
-  let extraClassDeclaration = [{
+  let extraClassDeclaration = extraClassDeclarationBase # [{
     /// Check if this is a cache with readonly modifier.
     bool isCacheReadonly() {
       return getDataClause() == acc::DataClause::acc_cache_readonly;
@@ -572,9 +674,14 @@ def OpenACC_CacheOp : OpenACC_DataEntryOp<"cache",
 // operations for the following OpenACC data clauses: copyout, detach, delete.
 class OpenACC_DataExitOp<string mnemonic, string clause, string extraDescription,
                          list<Trait> traits = [], dag additionalArgs = (ins)> :
-    OpenACC_Op<mnemonic, !listconcat(traits, [])> {
+    OpenACC_Op<mnemonic, !listconcat(traits,
+        [AttrSizedOperandSegments,
+         MemoryEffects<[MemRead<OpenACC_CurrentDeviceIdResource>]>])> {
   let arguments = !con(additionalArgs,
                       (ins Variadic<OpenACC_DataBoundsType>:$bounds,
+                       Variadic<IntOrIndex>:$asyncOperands,
+                       OptionalAttr<DeviceTypeArrayAttr>:$asyncOperandsDeviceType,
+                       OptionalAttr<DeviceTypeArrayAttr>:$asyncOnly,
                        DefaultValuedAttr<OpenACC_DataClauseAttr,clause>:$dataClause,
                        DefaultValuedAttr<BoolAttr, "true">:$structured,
                        DefaultValuedAttr<BoolAttr, "false">:$implicit,
@@ -585,6 +692,10 @@ class OpenACC_DataExitOp<string mnemonic, string clause, string extraDescription
     operation used.
     - `bounds`: Used when copying just slice of array or array's bounds are not
     encoded in type. They are in rank order where rank 0 is inner-most dimension.
+    - `asyncOperands` and `asyncOperandsDeviceType`:
+    pair-wise lists of the async clause values associated with device_type's.
+    - `asyncOnly`: a list of device_type's for which async clause
+    does not specify a value (default is acc_async_noval - OpenACC 3.3 2.16.1).
     - `dataClause`: Keeps track of the data clause the user used. This is because
     the acc operations are decomposed. So a 'copy' clause is decomposed to both 
     `acc.copyin` and `acc.copyout` operations, but both have dataClause that
@@ -596,13 +707,54 @@ class OpenACC_DataExitOp<string mnemonic, string clause, string extraDescription
     - `implicit`: Whether this is an implicitly generated operation, such as copies
     done to satisfy "Variables with Implicitly Determined Data Attributes" in 2.6.2.
     - `name`: Holds the name of variable as specified in user clause (including bounds).
+
+    The async values attached to the data exit operation imply that the data
+    action applies to all device types specified by the device_type clauses
+    using the activity queues on these devices as defined by the async values.
   }]);
 
+  code extraClassDeclarationBase = [{
+    /// Return true if the op has a value-less async clause for the
+    /// mlir::acc::DeviceType::None device_type.
+    bool hasAsyncOnly() {
+      return hasAsyncOnly(mlir::acc::DeviceType::None);
+    }
+    /// Return true if `asyncOnly` lists the given device_type. The attribute
+    /// is optional, so it must be null-checked before being iterated.
+    bool hasAsyncOnly(mlir::acc::DeviceType deviceType) {
+      mlir::ArrayAttr asyncOnly = getAsyncOnlyAttr();
+      return asyncOnly && llvm::any_of(asyncOnly, [&](mlir::Attribute attr) {
+        return mlir::cast<mlir::acc::DeviceTypeAttr>(attr).getValue() ==
+               deviceType;
+      });
+    }
+    /// Return the value of the async clause if present.
+    mlir::Value getAsyncValue() {
+      return getAsyncValue(mlir::acc::DeviceType::None);
+    }
+    /// Return the value of the async clause for the given device_type if
+    /// present.
+    mlir::Value getAsyncValue(mlir::acc::DeviceType deviceType) {
+      mlir::ArrayAttr deviceTypes = getAsyncOperandsDeviceTypeAttr();
+      if (!deviceTypes)
+        return nullptr;
+      for (auto [attr, asyncValue] :
+          llvm::zip(deviceTypes, getAsyncOperands())) {
+        auto deviceTypeAttr = mlir::cast<mlir::acc::DeviceTypeAttr>(attr);
+        if (deviceTypeAttr.getValue() == deviceType)
+          return asyncValue;
+      }
+      return nullptr;
+    }
+  }];
+
   let assemblyFormat = [{
     `accPtr` `(` $accPtr `:` type($accPtr) `)`
     oilist(
         `bounds` `(` $bounds `)`
       | `to` `varPtr` `(` $varPtr `:` type($varPtr) `)`
+      | `async` `(` custom<DeviceTypeOperands>($asyncOperands,
+            type($asyncOperands), $asyncOperandsDeviceType) `)`
     ) attr-dict
   }];
 
@@ -621,7 +773,7 @@ def OpenACC_CopyoutOp : OpenACC_DataExitOp<"copyout",
          Arg<OpenACC_PointerLikeTypeInterface,"Address of variable",[MemWrite]>:$varPtr)> {
   let summary = "Represents acc copyout semantics - reverse of copyin.";
 
-  let extraClassDeclaration = [{
+  let extraClassDeclaration = extraClassDeclarationBase # [{
     /// Check if this is a copyout with zero modifier.
     bool isCopyoutZero();
   }];
@@ -629,6 +781,8 @@ def OpenACC_CopyoutOp : OpenACC_DataExitOp<"copyout",
   let assemblyFormat = [{
     `accPtr` `(` $accPtr `:` type($accPtr) `)`
     (`bounds` `(` $bounds^ `)` )?
+    (`async` `(` custom<DeviceTypeOperands>($asyncOperands,
+            type($asyncOperands), $asyncOperandsDeviceType)^ `)`)?
     `to` `varPtr` `(` $varPtr `:` type($varPtr) `)`
     attr-dict
   }];
@@ -644,9 +798,13 @@ def OpenACC_DeleteOp : OpenACC_DataExitOp<"delete",
     (ins Arg<OpenACC_PointerLikeTypeInterface,"Address of device variable",[MemRead]>:$accPtr)> {
   let summary = "Represents acc delete semantics - reverse of create.";
 
+  let extraClassDeclaration = extraClassDeclarationBase;
+
   let assemblyFormat = [{
     `accPtr` `(` $accPtr `:` type($accPtr) `)`
     (`bounds` `(` $bounds^ `)` )?
+    (`async` `(` custom<DeviceTypeOperands>($asyncOperands,
+            type($asyncOperands), $asyncOperandsDeviceType)^ `)`)?
     attr-dict
   }];
 }
@@ -661,9 +819,13 @@ def OpenACC_DetachOp : OpenACC_DataExitOp<"detach",
     (ins Arg<OpenACC_PointerLikeTypeInterface,"Address of device variable",[MemRead]>:$accPtr)> {
   let summary = "Represents acc detach semantics - reverse of attach.";
 
+  let extraClassDeclaration = extraClassDeclarationBase;
+
   let assemblyFormat = [{
     `accPtr` `(` $accPtr `:` type($accPtr) `)`
     (`bounds` `(` $bounds^ `)` )?
+    (`async` `(` custom<DeviceTypeOperands>($asyncOperands,
+            type($asyncOperands), $asyncOperandsDeviceType)^ `)`)?
     attr-dict
   }];
 }
@@ -679,7 +841,7 @@ def OpenACC_UpdateHostOp : OpenACC_DataExitOp<"update_host",
     (ins Arg<OpenACC_PointerLikeTypeInterface,"Address of device variable",[MemRead]>:$accPtr,
          Arg<OpenACC_PointerLikeTypeInterface,"Address of variable",[MemWrite]>:$varPtr)> {
   let summary = "Represents acc update host semantics.";
-  let extraClassDeclaration = [{
+  let extraClassDeclaration = extraClassDeclarationBase # [{
     /// Check if this is an acc update self.
     bool isSelf() {
       return getDataClause() == acc::DataClause::acc_update_self;
@@ -689,6 +851,8 @@ def OpenACC_UpdateHostOp : OpenACC_DataExitOp<"update_host",
   let assemblyFormat = [{
     `accPtr` `(` $accPtr `:` type($accPtr) `)`
     (`bounds` `(` $bounds^ `)` )?
+    (`async` `(` custom<DeviceTypeOperands>($asyncOperands,
+            type($asyncOperands), $asyncOperandsDeviceType)^ `)`)?
     `to` `varPtr` `(` $varPtr `:` type($varPtr) `)`
     attr-dict
   }];
@@ -904,7 +1068,8 @@ def OpenACC_ReductionRecipeOp : OpenACC_Op<"reduction.recipe",
 
 def OpenACC_ParallelOp : OpenACC_Op<"parallel",
     [AttrSizedOperandSegments, RecursiveMemoryEffects,
-     MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>]> {
+     MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
+                    MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
   let summary = "parallel construct";
   let description = [{
     The "acc.parallel" operation represents a parallel construct block. It has
@@ -1009,6 +1174,9 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel",
     /// Return the wait devnum value clause for the given device_type if
     /// present.
     mlir::Value getWaitDevnum(mlir::acc::DeviceType deviceType);
+    /// Return the construct kind that this operation models (parallel).
+    static mlir::acc::Construct getConstructId() {
+      using mlir::acc::Construct;
+      return Construct::acc_construct_parallel;
+    }
   }];
 
   let assemblyFormat = [{
@@ -1050,7 +1218,8 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel",
 
 def OpenACC_SerialOp : OpenACC_Op<"serial",
     [AttrSizedOperandSegments, RecursiveMemoryEffects,
-     MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>]> {
+     MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
+                    MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
   let summary = "serial construct";
   let description = [{
     The "acc.serial" operation represents a serial construct block. It has
@@ -1127,6 +1296,9 @@ def OpenACC_SerialOp : OpenACC_Op<"serial",
     /// Return the wait devnum value clause for the given device_type if
     /// present.
     mlir::Value getWaitDevnum(mlir::acc::DeviceType deviceType);
+    /// Return the construct kind that this operation models (serial).
+    static mlir::acc::Construct getConstructId() {
+      using mlir::acc::Construct;
+      return Construct::acc_construct_serial;
+    }
   }];
 
   let assemblyFormat = [{
@@ -1162,7 +1334,8 @@ def OpenACC_SerialOp : OpenACC_Op<"serial",
 
 def OpenACC_KernelsOp : OpenACC_Op<"kernels",
     [AttrSizedOperandSegments, RecursiveMemoryEffects,
-     MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>]> {
+     MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
+                    MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
   let summary = "kernels construct";
   let description = [{
     The "acc.kernels" operation represents a kernels construct block. It has
@@ -1262,6 +1435,9 @@ def OpenACC_KernelsOp : OpenACC_Op<"kernels",
     /// Return the wait devnum value clause for the given device_type if
     /// present.
     mlir::Value getWaitDevnum(mlir::acc::DeviceType deviceType);
+    /// Return the construct kind that this operation models (kernels).
+    static mlir::acc::Construct getConstructId() {
+      using mlir::acc::Construct;
+      return Construct::acc_construct_kernels;
+    }
   }];
 
   let assemblyFormat = [{
@@ -1294,7 +1470,8 @@ def OpenACC_KernelsOp : OpenACC_Op<"kernels",
 
 def OpenACC_DataOp : OpenACC_Op<"data",
     [AttrSizedOperandSegments, RecursiveMemoryEffects,
-     MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>]> {
+     MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
+                    MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
   let summary = "data construct";
 
   let description = [{
@@ -1402,7 +1579,9 @@ def OpenACC_TerminatorOp : OpenACC_Op<"terminator", [Pure, Terminator]> {
 //===----------------------------------------------------------------------===//
 
 def OpenACC_EnterDataOp : OpenACC_Op<"enter_data",
-    [AttrSizedOperandSegments, MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>]> {
+    [AttrSizedOperandSegments,
+     MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
+                    MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
   let summary = "enter data operation";
 
   let description = [{
@@ -1451,7 +1630,9 @@ def OpenACC_EnterDataOp : OpenACC_Op<"enter_data",
 //===----------------------------------------------------------------------===//
 
 def OpenACC_ExitDataOp : OpenACC_Op<"exit_data",
-    [AttrSizedOperandSegments, MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>]> {
+    [AttrSizedOperandSegments,
+     MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
+                    MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
   let summary = "exit data operation";
 
   let description = [{
@@ -1501,7 +1682,9 @@ def OpenACC_ExitDataOp : OpenACC_Op<"exit_data",
 //===----------------------------------------------------------------------===//
 
 def OpenACC_HostDataOp : OpenACC_Op<"host_data",
-    [AttrSizedOperandSegments, MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>]> {
+    [AttrSizedOperandSegments,
+     MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
+                    MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
   let summary = "host_data construct";
 
   let description = [{
@@ -1890,7 +2073,8 @@ def AtomicCaptureOp : OpenACC_Op<"atomic.capture",
 //===----------------------------------------------------------------------===//
 
 def OpenACC_DeclareEnterOp : OpenACC_Op<"declare_enter",
-    [MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>]> {
+    [MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
+                    MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
   let summary = "declare directive - entry to implicit data region";
 
   let description = [{
@@ -1920,7 +2104,9 @@ def OpenACC_DeclareEnterOp : OpenACC_Op<"declare_enter",
 }
 
 def OpenACC_DeclareExitOp : OpenACC_Op<"declare_exit",
-    [AttrSizedOperandSegments, MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>]> {
+    [AttrSizedOperandSegments,
+     MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
+                    MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
   let summary = "declare directive - exit from implicit data region";
 
   let description = [{
@@ -2022,7 +2208,8 @@ def OpenACC_GlobalDestructorOp : OpenACC_Op<"global_dtor",
 }
 
 def OpenACC_DeclareOp : OpenACC_Op<"declare",
-    [RecursiveMemoryEffects, MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>]> {
+    [RecursiveMemoryEffects,
+     MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>]> {
   let summary = "declare implicit region";
 
   let description = [{
@@ -2227,7 +2414,8 @@ def OpenACC_ShutdownOp : OpenACC_Op<"shutdown", [AttrSizedOperandSegments]> {
 // 2.14.3. Set
 //===----------------------------------------------------------------------===//
 
-def OpenACC_SetOp : OpenACC_Op<"set", [AttrSizedOperandSegments]> {
+def OpenACC_SetOp : OpenACC_Op<"set", [AttrSizedOperandSegments,
+    MemoryEffects<[MemWrite<OpenACC_CurrentDeviceIdResource>]>]> {
   let summary = "set operation";
 
   let description = [{
@@ -2259,7 +2447,9 @@ def OpenACC_SetOp : OpenACC_Op<"set", [AttrSizedOperandSegments]> {
 //===----------------------------------------------------------------------===//
 
 def OpenACC_UpdateOp : OpenACC_Op<"update",
-    [AttrSizedOperandSegments, MemoryEffects<[MemWrite<OpenACC_ConstructResource>]>]> {
+    [AttrSizedOperandSegments,
+     MemoryEffects<[MemWrite<OpenACC_ConstructResource>,
+                    MemRead<OpenACC_CurrentDeviceIdResource>]>]> {
   let summary = "update operation";
 
   let description = [{
@@ -2362,6 +2552,11 @@ def OpenACC_WaitOp : OpenACC_Op<"wait", [AttrSizedOperandSegments]> {
     acc.wait(%value1: index)
     acc.wait() async(%async1: i32)
     ```
+
+    acc.wait does not implement the MemoryEffects interface,
+    so it is treated as affecting all resources. This is conservatively
+    correct. More precise modelling of its memory effects
+    seems to be impossible without whole-program analysis.
   }];
 
   let arguments = (ins Variadic<IntOrIndex>:$waitOperands,
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 1c8ce1ca3bce3..01305898f252d 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -2880,6 +2880,36 @@ mlir::acc::getBounds(mlir::Operation *accDataClauseOp) {
   return bounds;
 }
 
+/// Return a copy of the async operands of a data clause operation.
+/// The result is empty when `accDataClauseOp` is not one of the operation
+/// types listed by ACC_DATA_ENTRY_OPS / ACC_DATA_EXIT_OPS.
+mlir::SmallVector<mlir::Value>
+mlir::acc::getAsyncOperands(mlir::Operation *accDataClauseOp) {
+  using ValueVector = mlir::SmallVector<mlir::Value>;
+  return llvm::TypeSwitch<mlir::Operation *, ValueVector>(accDataClauseOp)
+      .Case<ACC_DATA_ENTRY_OPS, ACC_DATA_EXIT_OPS>([](auto dataClauseOp) {
+        // Fetch the operand range once and copy it into an owning vector.
+        auto asyncOperands = dataClauseOp.getAsyncOperands();
+        return ValueVector(asyncOperands.begin(), asyncOperands.end());
+      })
+      .Default([](mlir::Operation *) { return ValueVector(); });
+}
+
+/// Return the device_type array attribute that accompanies the async operands
+/// of a data clause operation. The result is a null attribute when
+/// `accDataClauseOp` is not one of the operation types listed by
+/// ACC_DATA_ENTRY_OPS / ACC_DATA_EXIT_OPS; callers should also be prepared for
+/// a null attribute coming from the operation itself.
+mlir::ArrayAttr
+mlir::acc::getAsyncOperandsDeviceType(mlir::Operation *accDataClauseOp) {
+  using AttrSwitch = llvm::TypeSwitch<mlir::Operation *, mlir::ArrayAttr>;
+  return AttrSwitch(accDataClauseOp)
+      .Case<ACC_DATA_ENTRY_OPS, ACC_DATA_EXIT_OPS>([](auto dataClauseOp) {
+        return dataClauseOp.getAsyncOperandsDeviceTypeAttr();
+      })
+      .Default([](mlir::Operation *) { return mlir::ArrayAttr(); });
+}
+
+/// Return the asyncOnly array attribute of a data clause operation.
+/// The result is a null attribute when `accDataClauseOp` is not one of the
+/// operation types listed by ACC_DATA_ENTRY_OPS / ACC_DATA_EXIT_OPS.
+mlir::ArrayAttr mlir::acc::getAsyncOnly(mlir::Operation *accDataClauseOp) {
+  using AttrSwitch = llvm::TypeSwitch<mlir::Operation *, mlir::ArrayAttr>;
+  return AttrSwitch(accDataClauseOp)
+      .Case<ACC_DATA_ENTRY_OPS, ACC_DATA_EXIT_OPS>([](auto dataClauseOp) {
+        return dataClauseOp.getAsyncOnlyAttr();
+      })
+      .Default([](mlir::Operation *) { return mlir::ArrayAttr(); });
+}
+
 std::optional<llvm::StringRef> mlir::acc::getVarName(mlir::Operation *accOp) {
   auto name{
       llvm::TypeSwitch<mlir::Operation *, std::optional<llvm::StringRef>>(accOp)



More information about the flang-commits mailing list