[llvm-branch-commits] [flang] [flang][OpenMP] Extend `do concurrent` mapping to device (PR #155987)
Kareem Ergawy via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun Aug 31 23:26:40 PDT 2025
https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/155987
>From 0373863b919e59130dcf57593f4283ece0dff12a Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Fri, 29 Aug 2025 02:04:49 -0500
Subject: [PATCH] [flang][OpenMP] Extend `do concurrent` mapping to device
Upstreams further parts of `do concurrent` to OpenMP conversion pass
from AMD's fork. This PR extends the pass by adding support for mapping
to the device.
---
flang/lib/Optimizer/OpenMP/CMakeLists.txt | 1 +
.../OpenMP/DoConcurrentConversion.cpp | 400 +++++++++++++++++-
.../Transforms/DoConcurrent/basic_device.f90 | 83 ++++
.../Transforms/DoConcurrent/basic_device.mlir | 10 +-
4 files changed, 476 insertions(+), 18 deletions(-)
create mode 100644 flang/test/Transforms/DoConcurrent/basic_device.f90
diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
index e0aebd0714c8f..b85ee7e861a4f 100644
--- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
@@ -26,6 +26,7 @@ add_flang_library(FlangOpenMPTransforms
FIRSupport
FortranSupport
HLFIRDialect
+ FortranUtils
MLIR_DEPS
${dialect_libs}
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index c928b76065ade..e975b86a6ba0d 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -6,17 +6,22 @@
//
//===----------------------------------------------------------------------===//
+#include "flang/Optimizer/Builder/DirectivesCommon.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/HLFIRTools.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/OpenMP/Passes.h"
#include "flang/Optimizer/OpenMP/Utils.h"
#include "flang/Support/OpenMP-utils.h"
+#include "flang/Utils/OpenMP.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
namespace flangomp {
#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS
@@ -107,6 +112,33 @@ struct InductionVariableInfo {
using InductionVariableInfos = llvm::SmallVector<InductionVariableInfo>;
+/// Collect the list of values used inside the loop but defined outside of it.
+void collectLoopLiveIns(fir::DoConcurrentLoopOp loop,
+ llvm::SmallVectorImpl<mlir::Value> &liveIns) {
+ llvm::SmallDenseSet<mlir::Value> seenValues;
+ llvm::SmallDenseSet<mlir::Operation *> seenOps;
+
+ for (auto [lb, ub, st] : llvm::zip_equal(
+ loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) {
+ liveIns.push_back(lb);
+ liveIns.push_back(ub);
+ liveIns.push_back(st);
+ }
+
+ mlir::visitUsedValuesDefinedAbove(
+ loop.getRegion(), [&](mlir::OpOperand *operand) {
+ if (!seenValues.insert(operand->get()).second)
+ return;
+
+ mlir::Operation *definingOp = operand->get().getDefiningOp();
+ // We want to collect ops corresponding to live-ins only once.
+ if (definingOp && !seenOps.insert(definingOp).second)
+ return;
+
+ liveIns.push_back(operand->get());
+ });
+}
+
/// Collects values that are local to a loop: "loop-local values". A loop-local
/// value is one that is used exclusively inside the loop but allocated outside
/// of it. This usually corresponds to temporary values that are used inside the
@@ -182,10 +214,6 @@ class DoConcurrentConversion
mlir::LogicalResult
matchAndRewrite(fir::DoConcurrentOp doLoop, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
- if (mapToDevice)
- return doLoop.emitError(
- "not yet implemented: Mapping `do concurrent` loops to device");
-
looputils::InductionVariableInfos ivInfos;
auto loop = mlir::cast<fir::DoConcurrentLoopOp>(
doLoop.getRegion().back().getTerminator());
@@ -196,20 +224,72 @@ class DoConcurrentConversion
for (mlir::Value indVar : *indVars)
ivInfos.emplace_back(loop, indVar);
+ llvm::SmallVector<mlir::Value> loopNestLiveIns;
+ looputils::collectLoopLiveIns(loop, loopNestLiveIns);
+ assert(!loopNestLiveIns.empty());
+
llvm::SetVector<mlir::Value> locals;
looputils::collectLoopLocalValues(loop, locals);
+ // We do not want to map "loop-local" values to the device through
+ // `omp.map.info` ops. Therefore, we remove them from the list of live-ins.
+ loopNestLiveIns.erase(llvm::remove_if(loopNestLiveIns,
+ [&](mlir::Value liveIn) {
+ return locals.contains(liveIn);
+ }),
+ loopNestLiveIns.end());
+
+ mlir::omp::TargetOp targetOp;
+ mlir::omp::LoopNestOperands loopNestClauseOps;
+
mlir::IRMapping mapper;
+
+ if (mapToDevice) {
+ mlir::ModuleOp module = doLoop->getParentOfType<mlir::ModuleOp>();
+ bool isTargetDevice =
+ llvm::cast<mlir::omp::OffloadModuleInterface>(*module)
+ .getIsTargetDevice();
+
+ mlir::omp::TargetOperands targetClauseOps;
+ genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
+ loopNestClauseOps,
+ isTargetDevice ? nullptr : &targetClauseOps);
+
+ LiveInShapeInfoMap liveInShapeInfoMap;
+ fir::FirOpBuilder builder(
+ rewriter,
+ fir::getKindMapping(doLoop->getParentOfType<mlir::ModuleOp>()));
+
+ for (mlir::Value liveIn : loopNestLiveIns) {
+ targetClauseOps.mapVars.push_back(
+ genMapInfoOpForLiveIn(builder, liveIn));
+ liveInShapeInfoMap.insert(
+ {liveIn, TargetDeclareShapeCreationInfo(liveIn)});
+ }
+
+ targetOp =
+ genTargetOp(doLoop.getLoc(), rewriter, mapper, loopNestLiveIns,
+ targetClauseOps, loopNestClauseOps, liveInShapeInfoMap);
+ genTeamsOp(doLoop.getLoc(), rewriter);
+ }
+
mlir::omp::ParallelOp parallelOp =
genParallelOp(doLoop.getLoc(), rewriter, ivInfos, mapper);
- mlir::omp::LoopNestOperands loopNestClauseOps;
- genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
- loopNestClauseOps);
+
+ // Only set as composite when part of `distribute parallel do`.
+ parallelOp.setComposite(mapToDevice);
+
+ if (!mapToDevice)
+ genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, mapper,
+ loopNestClauseOps);
for (mlir::Value local : locals)
looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
rewriter);
+ if (mapToDevice)
+ genDistributeOp(doLoop.getLoc(), rewriter).setComposite(/*val=*/true);
+
mlir::omp::LoopNestOp ompLoopNest =
genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps,
/*isComposite=*/mapToDevice);
@@ -244,6 +324,51 @@ class DoConcurrentConversion
}
private:
+ struct TargetDeclareShapeCreationInfo {
+    // Note: We use `std::vector` (rather than `llvm::SmallVector` as usual) to
+    // interface more easily with `ShapeShiftOp::getOrigins()`, which returns
+    // `std::vector`.
+ std::vector<mlir::Value> startIndices{};
+ std::vector<mlir::Value> extents{};
+
+ TargetDeclareShapeCreationInfo(mlir::Value liveIn) {
+ mlir::Value shape = nullptr;
+ mlir::Operation *liveInDefiningOp = liveIn.getDefiningOp();
+ auto declareOp =
+ mlir::dyn_cast_if_present<hlfir::DeclareOp>(liveInDefiningOp);
+
+ if (declareOp != nullptr)
+ shape = declareOp.getShape();
+
+ if (shape == nullptr)
+ return;
+
+ auto shapeOp =
+ mlir::dyn_cast_if_present<fir::ShapeOp>(shape.getDefiningOp());
+ auto shapeShiftOp =
+ mlir::dyn_cast_if_present<fir::ShapeShiftOp>(shape.getDefiningOp());
+
+ if (shapeOp == nullptr && shapeShiftOp == nullptr)
+ TODO(liveIn.getLoc(),
+ "Shapes not defined by `fir.shape` or `fir.shape_shift` op's are"
+ "not supported yet.");
+
+ if (shapeShiftOp != nullptr)
+ startIndices = shapeShiftOp.getOrigins();
+
+ extents = shapeOp != nullptr
+ ? std::vector<mlir::Value>(shapeOp.getExtents().begin(),
+ shapeOp.getExtents().end())
+ : shapeShiftOp.getExtents();
+ }
+
+ bool isShapedValue() const { return !extents.empty(); }
+ bool isShapeShiftedValue() const { return !startIndices.empty(); }
+ };
+
+ using LiveInShapeInfoMap =
+ llvm::DenseMap<mlir::Value, TargetDeclareShapeCreationInfo>;
+
mlir::omp::ParallelOp
genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
looputils::InductionVariableInfos &ivInfos,
@@ -284,11 +409,11 @@ class DoConcurrentConversion
return result;
}
- void
- genLoopNestClauseOps(mlir::Location loc,
- mlir::ConversionPatternRewriter &rewriter,
- fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
- mlir::omp::LoopNestOperands &loopNestClauseOps) const {
+ void genLoopNestClauseOps(
+ mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
+ fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
+ mlir::omp::LoopNestOperands &loopNestClauseOps,
+ mlir::omp::TargetOperands *targetClauseOps = nullptr) const {
assert(loopNestClauseOps.loopLowerBounds.empty() &&
"Loop nest bounds were already emitted!");
@@ -297,11 +422,19 @@ class DoConcurrentConversion
bounds.push_back(var.getDefiningOp()->getResult(0));
};
+ auto hostEvalCapture = [&](mlir::Value var,
+ llvm::SmallVectorImpl<mlir::Value> &bounds) {
+ populateBounds(var, bounds);
+
+ if (targetClauseOps)
+ targetClauseOps->hostEvalVars.push_back(var);
+ };
+
for (auto [lb, ub, st] : llvm::zip_equal(
loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) {
- populateBounds(lb, loopNestClauseOps.loopLowerBounds);
- populateBounds(ub, loopNestClauseOps.loopUpperBounds);
- populateBounds(st, loopNestClauseOps.loopSteps);
+ hostEvalCapture(lb, loopNestClauseOps.loopLowerBounds);
+ hostEvalCapture(ub, loopNestClauseOps.loopUpperBounds);
+ hostEvalCapture(st, loopNestClauseOps.loopSteps);
}
loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
@@ -439,6 +572,243 @@ class DoConcurrentConversion
return loopNestOp;
}
+ void genBoundsOps(fir::FirOpBuilder &builder, mlir::Value liveIn,
+ mlir::Value rawAddr,
+ llvm::SmallVectorImpl<mlir::Value> &boundsOps) const {
+ fir::ExtendedValue extVal =
+ hlfir::translateToExtendedValue(rawAddr.getLoc(), builder,
+ hlfir::Entity{liveIn},
+ /*contiguousHint=*/
+ true)
+ .first;
+ fir::factory::AddrAndBoundsInfo info = fir::factory::getDataOperandBaseAddr(
+ builder, rawAddr, /*isOptional=*/false, rawAddr.getLoc());
+ boundsOps = fir::factory::genImplicitBoundsOps<mlir::omp::MapBoundsOp,
+ mlir::omp::MapBoundsType>(
+ builder, info, extVal,
+ /*dataExvIsAssumedSize=*/false, rawAddr.getLoc());
+ }
+
+ mlir::omp::MapInfoOp genMapInfoOpForLiveIn(fir::FirOpBuilder &builder,
+ mlir::Value liveIn) const {
+ mlir::Value rawAddr = liveIn;
+ llvm::StringRef name;
+
+ mlir::Operation *liveInDefiningOp = liveIn.getDefiningOp();
+ auto declareOp =
+ mlir::dyn_cast_if_present<hlfir::DeclareOp>(liveInDefiningOp);
+
+ if (declareOp != nullptr) {
+ // Use the raw address to avoid unboxing `fir.box` values whenever
+ // possible. Put differently, if we have access to the direct value memory
+ // reference/address, we use it.
+ rawAddr = declareOp.getOriginalBase();
+ name = declareOp.getUniqName();
+ }
+
+ if (!llvm::isa<mlir::omp::PointerLikeType>(rawAddr.getType())) {
+ builder.setInsertionPointAfter(liveInDefiningOp);
+ auto copyVal = builder.createTemporary(liveIn.getLoc(), liveIn.getType());
+ builder.createStoreWithConvert(copyVal.getLoc(), liveIn, copyVal);
+ rawAddr = copyVal;
+ }
+
+ mlir::Type liveInType = liveIn.getType();
+ mlir::Type eleType = liveInType;
+ if (auto refType = mlir::dyn_cast<fir::ReferenceType>(liveInType))
+ eleType = refType.getElementType();
+
+ llvm::omp::OpenMPOffloadMappingFlags mapFlag =
+ llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT;
+ mlir::omp::VariableCaptureKind captureKind =
+ mlir::omp::VariableCaptureKind::ByRef;
+
+ if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) {
+ captureKind = mlir::omp::VariableCaptureKind::ByCopy;
+ } else if (!fir::isa_builtin_cptr_type(eleType)) {
+ mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO;
+ mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM;
+ }
+
+ llvm::SmallVector<mlir::Value> boundsOps;
+ genBoundsOps(builder, liveIn, rawAddr, boundsOps);
+
+ return Fortran::utils::openmp::createMapInfoOp(
+ builder, liveIn.getLoc(), rawAddr,
+ /*varPtrPtr=*/{}, name.str(), boundsOps,
+ /*members=*/{},
+ /*membersIndex=*/mlir::ArrayAttr{},
+ static_cast<
+ std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>(
+ mapFlag),
+ captureKind, rawAddr.getType());
+ }
+
+ mlir::omp::TargetOp
+ genTargetOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
+ mlir::IRMapping &mapper, llvm::ArrayRef<mlir::Value> mappedVars,
+ mlir::omp::TargetOperands &clauseOps,
+ mlir::omp::LoopNestOperands &loopNestClauseOps,
+ const LiveInShapeInfoMap &liveInShapeInfoMap) const {
+ auto targetOp = rewriter.create<mlir::omp::TargetOp>(loc, clauseOps);
+ auto argIface = llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*targetOp);
+
+ mlir::Region ®ion = targetOp.getRegion();
+
+ llvm::SmallVector<mlir::Type> regionArgTypes;
+ llvm::SmallVector<mlir::Location> regionArgLocs;
+
+ for (auto var : llvm::concat<const mlir::Value>(clauseOps.hostEvalVars,
+ clauseOps.mapVars)) {
+ regionArgTypes.push_back(var.getType());
+ regionArgLocs.push_back(var.getLoc());
+ }
+
+ rewriter.createBlock(®ion, {}, regionArgTypes, regionArgLocs);
+ fir::FirOpBuilder builder(
+ rewriter,
+ fir::getKindMapping(targetOp->getParentOfType<mlir::ModuleOp>()));
+
+    // Within the loop, it is possible that we discover other values that need
+    // to be mapped to the target region (the shape info values for arrays, for
+    // example). Therefore, the map block args might be extended and resized.
+ // Hence, we invoke `argIface.getMapBlockArgs()` every iteration to make
+ // sure we access the proper vector of data.
+ int idx = 0;
+ for (auto [mapInfoOp, mappedVar] :
+ llvm::zip_equal(clauseOps.mapVars, mappedVars)) {
+ auto miOp = mlir::cast<mlir::omp::MapInfoOp>(mapInfoOp.getDefiningOp());
+ hlfir::DeclareOp liveInDeclare =
+ genLiveInDeclare(builder, targetOp, argIface.getMapBlockArgs()[idx],
+ miOp, liveInShapeInfoMap.at(mappedVar));
+ ++idx;
+
+      // TODO If `mappedVar.getDefiningOp()` is a `fir::BoxAddrOp`, we probably
+      // need to "unpack" the box by getting the defining op of its value.
+ // However, we did not hit this case in reality yet so leaving it as a
+ // todo for now.
+
+ auto mapHostValueToDevice = [&](mlir::Value hostValue,
+ mlir::Value deviceValue) {
+ if (!llvm::isa<mlir::omp::PointerLikeType>(hostValue.getType()))
+ mapper.map(hostValue,
+ builder.loadIfRef(hostValue.getLoc(), deviceValue));
+ else
+ mapper.map(hostValue, deviceValue);
+ };
+
+ mapHostValueToDevice(mappedVar, liveInDeclare.getOriginalBase());
+
+ if (auto origDeclareOp = mlir::dyn_cast_if_present<hlfir::DeclareOp>(
+ mappedVar.getDefiningOp()))
+ mapHostValueToDevice(origDeclareOp.getBase(), liveInDeclare.getBase());
+ }
+
+ for (auto [arg, hostEval] : llvm::zip_equal(argIface.getHostEvalBlockArgs(),
+ clauseOps.hostEvalVars))
+ mapper.map(hostEval, arg);
+
+ for (unsigned i = 0; i < loopNestClauseOps.loopLowerBounds.size(); ++i) {
+ loopNestClauseOps.loopLowerBounds[i] =
+ mapper.lookup(loopNestClauseOps.loopLowerBounds[i]);
+ loopNestClauseOps.loopUpperBounds[i] =
+ mapper.lookup(loopNestClauseOps.loopUpperBounds[i]);
+ loopNestClauseOps.loopSteps[i] =
+ mapper.lookup(loopNestClauseOps.loopSteps[i]);
+ }
+
+ // Check if cloning the bounds introduced any dependency on the outer
+ // region. If so, then either clone them as well if they are
+ // MemoryEffectFree, or else copy them to a new temporary and add them to
+ // the map and block_argument lists and replace their uses with the new
+ // temporary.
+ Fortran::utils::openmp::cloneOrMapRegionOutsiders(builder, targetOp);
+ rewriter.setInsertionPoint(
+ rewriter.create<mlir::omp::TerminatorOp>(targetOp.getLoc()));
+
+ return targetOp;
+ }
+
+ hlfir::DeclareOp genLiveInDeclare(
+ fir::FirOpBuilder &builder, mlir::omp::TargetOp targetOp,
+ mlir::Value liveInArg, mlir::omp::MapInfoOp liveInMapInfoOp,
+ const TargetDeclareShapeCreationInfo &targetShapeCreationInfo) const {
+ mlir::Type liveInType = liveInArg.getType();
+ std::string liveInName = liveInMapInfoOp.getName().has_value()
+ ? liveInMapInfoOp.getName().value().str()
+ : std::string("");
+ if (fir::isa_ref_type(liveInType))
+ liveInType = fir::unwrapRefType(liveInType);
+
+ mlir::Value shape = [&]() -> mlir::Value {
+ if (!targetShapeCreationInfo.isShapedValue())
+ return {};
+
+ llvm::SmallVector<mlir::Value> extentOperands;
+ llvm::SmallVector<mlir::Value> startIndexOperands;
+
+ if (targetShapeCreationInfo.isShapeShiftedValue()) {
+ llvm::SmallVector<mlir::Value> shapeShiftOperands;
+
+ size_t shapeIdx = 0;
+ for (auto [startIndex, extent] :
+ llvm::zip_equal(targetShapeCreationInfo.startIndices,
+ targetShapeCreationInfo.extents)) {
+ shapeShiftOperands.push_back(
+ Fortran::utils::openmp::mapTemporaryValue(
+ builder, targetOp, startIndex,
+ liveInName + ".start_idx.dim" + std::to_string(shapeIdx)));
+ shapeShiftOperands.push_back(
+ Fortran::utils::openmp::mapTemporaryValue(
+ builder, targetOp, extent,
+ liveInName + ".extent.dim" + std::to_string(shapeIdx)));
+ ++shapeIdx;
+ }
+
+ auto shapeShiftType = fir::ShapeShiftType::get(
+ builder.getContext(), shapeShiftOperands.size() / 2);
+ return builder.create<fir::ShapeShiftOp>(
+ liveInArg.getLoc(), shapeShiftType, shapeShiftOperands);
+ }
+
+ llvm::SmallVector<mlir::Value> shapeOperands;
+ size_t shapeIdx = 0;
+ for (auto extent : targetShapeCreationInfo.extents) {
+ shapeOperands.push_back(Fortran::utils::openmp::mapTemporaryValue(
+ builder, targetOp, extent,
+ liveInName + ".extent.dim" + std::to_string(shapeIdx)));
+ ++shapeIdx;
+ }
+
+ return builder.create<fir::ShapeOp>(liveInArg.getLoc(), shapeOperands);
+ }();
+
+ return builder.create<hlfir::DeclareOp>(liveInArg.getLoc(), liveInArg,
+ liveInName, shape);
+ }
+
+ mlir::omp::TeamsOp
+ genTeamsOp(mlir::Location loc,
+ mlir::ConversionPatternRewriter &rewriter) const {
+ auto teamsOp = rewriter.create<mlir::omp::TeamsOp>(
+ loc, /*clauses=*/mlir::omp::TeamsOperands{});
+
+ rewriter.createBlock(&teamsOp.getRegion());
+ rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
+
+ return teamsOp;
+ }
+
+ mlir::omp::DistributeOp
+ genDistributeOp(mlir::Location loc,
+ mlir::ConversionPatternRewriter &rewriter) const {
+ auto distOp = rewriter.create<mlir::omp::DistributeOp>(
+ loc, /*clauses=*/mlir::omp::DistributeOperands{});
+
+ rewriter.createBlock(&distOp.getRegion());
+ return distOp;
+ }
+
bool mapToDevice;
llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip;
mlir::SymbolTable &moduleSymbolTable;
diff --git a/flang/test/Transforms/DoConcurrent/basic_device.f90 b/flang/test/Transforms/DoConcurrent/basic_device.f90
new file mode 100644
index 0000000000000..7bce696387646
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/basic_device.f90
@@ -0,0 +1,83 @@
+! Tests mapping of a basic `do concurrent` loop to
+! `!$omp target teams distribute parallel do`.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN: | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN: | FileCheck %s
+
+program do_concurrent_basic
+ implicit none
+ integer :: a(10)
+ integer :: i
+
+ ! CHECK-DAG: %[[I_ORIG_ALLOC:.*]] = fir.alloca i32 {bindc_name = "i"}
+ ! CHECK: %[[I_ORIG_DECL:.*]]:2 = hlfir.declare %[[I_ORIG_ALLOC]]
+
+ ! CHECK-DAG: %[[A_ADDR:.*]] = fir.address_of(@_QFEa)
+ ! CHECK: %[[A_SHAPE:.*]] = fir.shape %[[A_EXTENT:.*]] : (index) -> !fir.shape<1>
+ ! CHECK: %[[A_ORIG_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]](%[[A_SHAPE]])
+
+ ! CHECK-NOT: fir.do_loop
+
+ ! CHECK: %[[C1:.*]] = arith.constant 1 : i32
+ ! CHECK: %[[HOST_LB:.*]] = fir.convert %[[C1]] : (i32) -> index
+ ! CHECK: %[[C10:.*]] = arith.constant 10 : i32
+ ! CHECK: %[[HOST_UB:.*]] = fir.convert %[[C10]] : (i32) -> index
+ ! CHECK: %[[HOST_STEP:.*]] = arith.constant 1 : index
+
+ ! CHECK-DAG: %[[I_MAP_INFO:.*]] = omp.map.info var_ptr(%[[I_ORIG_DECL]]#1
+ ! CHECK: %[[C0:.*]] = arith.constant 0 : index
+ ! CHECK: %[[UPPER_BOUND:.*]] = arith.subi %[[A_EXTENT]], %{{c1.*}} : index
+
+ ! CHECK: %[[A_BOUNDS:.*]] = omp.map.bounds lower_bound(%[[C0]] : index)
+ ! CHECK-SAME: upper_bound(%[[UPPER_BOUND]] : index)
+ ! CHECK-SAME: extent(%[[A_EXTENT]] : index)
+
+ ! CHECK-DAG: %[[A_MAP_INFO:.*]] = omp.map.info var_ptr(%[[A_ORIG_DECL]]#1 : {{[^(]+}})
+ ! CHECK-SAME: map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[A_BOUNDS]])
+
+ ! CHECK: omp.target
+ ! CHECK-SAME: host_eval(%[[HOST_LB]] -> %[[LB:[[:alnum:]]+]], %[[HOST_UB]] -> %[[UB:[[:alnum:]]+]], %[[HOST_STEP]] -> %[[STEP:[[:alnum:]]+]] : index, index, index)
+ ! CHECK-SAME: map_entries(
+ ! CHECK-SAME: %{{[[:alnum:]]+}} -> %{{[^,]+}},
+ ! CHECK-SAME: %{{[[:alnum:]]+}} -> %{{[^,]+}},
+ ! CHECK-SAME: %{{[[:alnum:]]+}} -> %{{[^,]+}},
+ ! CHECK-SAME: %[[I_MAP_INFO]] -> %[[I_ARG:[[:alnum:]]+]],
+ ! CHECK-SAME: %[[A_MAP_INFO]] -> %[[A_ARG:.[[:alnum:]]+]]
+
+ ! CHECK: %[[A_DEV_DECL:.*]]:2 = hlfir.declare %[[A_ARG]]
+ ! CHECK: omp.teams {
+ ! CHECK-NEXT: omp.parallel {
+
+ ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
+ ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+ ! CHECK-NEXT: omp.distribute {
+ ! CHECK-NEXT: omp.wsloop {
+
+ ! CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+ ! CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32
+ ! CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#0 : !fir.ref<i32>
+ ! CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+ ! CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+ ! CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64
+ ! CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[A_DEV_DECL]]#0 (%[[IV_VAL_I64]]) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+ ! CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref<i32>
+ ! CHECK-NEXT: omp.yield
+ ! CHECK-NEXT: }
+
+ ! CHECK-NEXT: } {omp.composite}
+ ! CHECK-NEXT: } {omp.composite}
+ ! CHECK-NEXT: omp.terminator
+ ! CHECK-NEXT: } {omp.composite}
+ ! CHECK-NEXT: omp.terminator
+ ! CHECK-NEXT: }
+ ! CHECK-NEXT: omp.terminator
+ ! CHECK-NEXT: }
+ do concurrent (i=1:10)
+ a(i) = i
+ end do
+
+ ! CHECK-NOT: fir.do_loop
+end program do_concurrent_basic
diff --git a/flang/test/Transforms/DoConcurrent/basic_device.mlir b/flang/test/Transforms/DoConcurrent/basic_device.mlir
index 0ca48943864c8..fa511c3d46d58 100644
--- a/flang/test/Transforms/DoConcurrent/basic_device.mlir
+++ b/flang/test/Transforms/DoConcurrent/basic_device.mlir
@@ -1,4 +1,4 @@
-// RUN: fir-opt --omp-do-concurrent-conversion="map-to=device" -verify-diagnostics %s
+// RUN: fir-opt --omp-do-concurrent-conversion="map-to=device" %s -o - | FileCheck %s
func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
%2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
@@ -11,8 +11,12 @@ func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_bas
%8 = fir.convert %c10_i32 : (i32) -> index
%c1 = arith.constant 1 : index
- // expected-error at +2 {{not yet implemented: Mapping `do concurrent` loops to device}}
- // expected-error at below {{failed to legalize operation 'fir.do_concurrent'}}
+ // CHECK: omp.target
+ // CHECK: omp.teams
+ // CHECK: omp.parallel
+ // CHECK: omp.distribute
+ // CHECK: omp.wsloop
+ // CHECK: omp.loop_nest
fir.do_concurrent {
%0 = fir.alloca i32 {bindc_name = "i"}
%1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
More information about the llvm-branch-commits
mailing list