[flang-commits] [clang] [flang] [flang][OpenMP] Add -f[no]-openmp-simd (PR #150269)
Kajetan Puchalski via flang-commits
flang-commits at lists.llvm.org
Wed Jul 23 10:13:34 PDT 2025
https://github.com/mrkajetanp created https://github.com/llvm/llvm-project/pull/150269
Both clang and gfortran support the -fopenmp-simd flag, which enables OpenMP support only for simd constructs, while disabling the rest of OpenMP.
Add a new SimdOnly flang OpenMP IR pass which rewrites generated OpenMP FIR to remove all constructs except for omp.simd constructs, and constructs nested under them.
With this approach, the logic required to make the flag work can be self-contained within the pass, as opposed to being scattered all over the lowering code.
The flag is expected to have no effect if -fopenmp is passed explicitly, and is only expected to remove OpenMP constructs, not things like OpenMP library function calls. This matches the behaviour of other compilers.
>From 2e2b2cafe661604237f6a867459cfa1db8db5f6f Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski at arm.com>
Date: Mon, 30 Jun 2025 16:00:08 +0000
Subject: [PATCH] [flang][OpenMP] Add -f[no]-openmp-simd
Both clang and gfortran support the -fopenmp-simd flag, which enables
OpenMP support only for simd constructs, while disabling the rest of
OpenMP.
Add a new SimdOnly flang OpenMP IR pass which rewrites generated
OpenMP FIR to remove all constructs except for omp.simd constructs,
and constructs nested under them.
With this approach, the logic required to make the flag work can
be self-contained within the pass, as opposed to being scattered
all over the lowering code.
The flag is expected to have no effect if -fopenmp is passed
explicitly, and is only expected to remove OpenMP constructs, not
things like OpenMP library function calls. This matches the
behaviour of other compilers.
Signed-off-by: Kajetan Puchalski <kajetan.puchalski at arm.com>
---
clang/include/clang/Driver/Options.td | 15 +-
clang/lib/Driver/ToolChains/Flang.cpp | 5 +
.../include/flang/Optimizer/OpenMP/Passes.td | 5 +
.../flang/Optimizer/Passes/Pipelines.h | 5 +-
.../flang/Optimizer/Transforms/Utils.h | 4 +
flang/include/flang/Support/LangOptions.def | 2 +
flang/include/flang/Tools/CrossToolHelpers.h | 1 +
flang/lib/Frontend/CompilerInvocation.cpp | 11 +-
flang/lib/Frontend/FrontendActions.cpp | 25 +-
flang/lib/Lower/OpenMP/ClauseProcessor.h | 11 +-
flang/lib/Lower/OpenMP/OpenMP.cpp | 54 +-
flang/lib/Optimizer/OpenMP/CMakeLists.txt | 1 +
flang/lib/Optimizer/OpenMP/SimdOnly.cpp | 360 ++++++++++
flang/lib/Optimizer/Passes/Pipelines.cpp | 14 +-
.../Transforms/ControlFlowConverter.cpp | 206 +++---
flang/test/Driver/fopenmp-simd.f90 | 59 ++
flang/test/Transforms/OpenMP/simd-only.mlir | 622 ++++++++++++++++++
flang/tools/bbc/bbc.cpp | 4 +-
18 files changed, 1263 insertions(+), 141 deletions(-)
create mode 100644 flang/lib/Optimizer/OpenMP/SimdOnly.cpp
create mode 100644 flang/test/Driver/fopenmp-simd.f90
create mode 100644 flang/test/Transforms/OpenMP/simd-only.mlir
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 916400efdb449..7a74dcffde4a9 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3706,14 +3706,19 @@ def fopenmp_relocatable_target : Flag<["-"], "fopenmp-relocatable-target">,
def fnoopenmp_relocatable_target : Flag<["-"], "fnoopenmp-relocatable-target">,
Group<f_Group>, Flags<[NoArgumentUnused, HelpHidden]>,
Visibility<[ClangOption, CC1Option]>;
-def fopenmp_simd : Flag<["-"], "fopenmp-simd">, Group<f_Group>,
- Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>,
- HelpText<"Emit OpenMP code only for SIMD-based constructs.">;
+def fopenmp_simd : Flag<["-"], "fopenmp-simd">,
+ Group<f_Group>,
+ Flags<[NoArgumentUnused]>,
+ Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
+ HelpText<"Emit OpenMP code only for SIMD-based constructs.">;
def fopenmp_enable_irbuilder : Flag<["-"], "fopenmp-enable-irbuilder">, Group<f_Group>,
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
HelpText<"Use the experimental OpenMP-IR-Builder codegen path.">;
-def fno_openmp_simd : Flag<["-"], "fno-openmp-simd">, Group<f_Group>,
- Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>;
+def fno_openmp_simd
+ : Flag<["-"], "fno-openmp-simd">,
+ Group<f_Group>,
+ Flags<[NoArgumentUnused]>,
+ Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>;
def fopenmp_cuda_mode : Flag<["-"], "fopenmp-cuda-mode">, Group<f_Group>,
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>;
def fno_openmp_cuda_mode : Flag<["-"], "fno-openmp-cuda-mode">, Group<f_Group>,
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 7ab41e9b85a04..547e3156f519a 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -937,6 +937,8 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA,
if (Args.hasArg(options::OPT_fopenmp_force_usm))
CmdArgs.push_back("-fopenmp-force-usm");
+ Args.AddLastArg(CmdArgs, options::OPT_fopenmp_simd,
+ options::OPT_fno_openmp_simd);
// FIXME: Clang supports a whole bunch more flags here.
break;
@@ -952,6 +954,9 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA,
<< A->getSpelling() << A->getValue();
break;
}
+ } else {
+ Args.AddLastArg(CmdArgs, options::OPT_fopenmp_simd,
+ options::OPT_fno_openmp_simd);
}
// Pass the path to compiler resource files.
diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td
index 704faf0ccd856..79c1a5cfd9aca 100644
--- a/flang/include/flang/Optimizer/OpenMP/Passes.td
+++ b/flang/include/flang/Optimizer/OpenMP/Passes.td
@@ -112,4 +112,9 @@ def GenericLoopConversionPass
];
}
+def SimdOnlyPass : Pass<"omp-simd-only", "mlir::func::FuncOp"> {
+ let summary = "Filters out non-simd OpenMP constructs";
+ let dependentDialects = ["mlir::omp::OpenMPDialect"];
+}
+
#endif //FORTRAN_OPTIMIZER_OPENMP_PASSES
diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h
index a3f59ee8dd013..fd8c43cc88a19 100644
--- a/flang/include/flang/Optimizer/Passes/Pipelines.h
+++ b/flang/include/flang/Optimizer/Passes/Pipelines.h
@@ -119,13 +119,16 @@ void registerDefaultInlinerPass(MLIRToLLVMPassPipelineConfig &config);
void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
MLIRToLLVMPassPipelineConfig &pc);
+/// Select which mode to enable OpenMP support in.
+enum class EnableOpenMP { None, Simd, Full };
+
/// Create a pass pipeline for lowering from HLFIR to FIR
///
/// \param pm - MLIR pass manager that will hold the pipeline definition
/// \param optLevel - optimization level used for creating FIR optimization
/// passes pipeline
void createHLFIRToFIRPassPipeline(
- mlir::PassManager &pm, bool enableOpenMP,
+ mlir::PassManager &pm, EnableOpenMP enableOpenMP,
llvm::OptimizationLevel optLevel = defaultOptLevel);
struct OpenMPFIRPassPipelineOpts {
diff --git a/flang/include/flang/Optimizer/Transforms/Utils.h b/flang/include/flang/Optimizer/Transforms/Utils.h
index 49a616fb40fd5..307e6b59c57d4 100644
--- a/flang/include/flang/Optimizer/Transforms/Utils.h
+++ b/flang/include/flang/Optimizer/Transforms/Utils.h
@@ -33,6 +33,10 @@ void genMinMaxlocReductionLoop(fir::FirOpBuilder &builder, mlir::Value array,
mlir::Type maskElemType, mlir::Value resultArr,
bool maskMayBeLogicalScalar);
+std::pair<mlir::Block *, mlir::Block *>
+convertDoLoopToCFG(DoLoopOp loop, mlir::PatternRewriter &rewriter, bool setNSW,
+ bool forceLoopToExecuteOnce);
+
} // namespace fir
#endif // FORTRAN_OPTIMIZER_TRANSFORMS_UTILS_H
diff --git a/flang/include/flang/Support/LangOptions.def b/flang/include/flang/Support/LangOptions.def
index d5bf7a2ecc036..ba72d7b4b7212 100644
--- a/flang/include/flang/Support/LangOptions.def
+++ b/flang/include/flang/Support/LangOptions.def
@@ -58,6 +58,8 @@ LANGOPT(OpenMPTeamSubscription, 1, 0)
LANGOPT(OpenMPNoThreadState, 1, 0)
/// Assume that no thread in a parallel region will encounter a parallel region
LANGOPT(OpenMPNoNestedParallelism, 1, 0)
+/// Use SIMD only OpenMP support.
+LANGOPT(OpenMPSimd, 1, false)
LANGOPT(VScaleMin, 32, 0) ///< Minimum vscale range value
LANGOPT(VScaleMax, 32, 0) ///< Maximum vscale range value
diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h
index df1da27058552..51958fa36c3ad 100644
--- a/flang/include/flang/Tools/CrossToolHelpers.h
+++ b/flang/include/flang/Tools/CrossToolHelpers.h
@@ -134,6 +134,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks {
///< functions.
bool NSWOnLoopVarInc = true; ///< Add nsw flag to loop variable increments.
bool EnableOpenMP = false; ///< Enable OpenMP lowering.
+ bool EnableOpenMPSimd = false; ///< Enable OpenMP simd-only mode.
std::string InstrumentFunctionEntry =
""; ///< Name of the instrument-function that is called on each
///< function-entry
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index f55d866435997..80fd52b170f0c 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -1152,8 +1152,15 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
clang::DiagnosticsEngine &diags) {
llvm::opt::Arg *arg = args.getLastArg(clang::driver::options::OPT_fopenmp,
clang::driver::options::OPT_fno_openmp);
- if (!arg || arg->getOption().matches(clang::driver::options::OPT_fno_openmp))
- return true;
+ if (!arg ||
+ arg->getOption().matches(clang::driver::options::OPT_fno_openmp)) {
+ bool isSimdSpecified = args.hasFlag(
+ clang::driver::options::OPT_fopenmp_simd,
+ clang::driver::options::OPT_fno_openmp_simd, /*Default=*/false);
+ if (!isSimdSpecified)
+ return true;
+ res.getLangOpts().OpenMPSimd = 1;
+ }
unsigned numErrorsBefore = diags.getNumErrors();
llvm::Triple t(res.getTargetOpts().triple);
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index b5f4f9421f633..0ac4c7094ec3b 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -297,6 +297,7 @@ bool CodeGenAction::beginSourceFileAction() {
bool isOpenMPEnabled =
ci.getInvocation().getFrontendOpts().features.IsEnabled(
Fortran::common::LanguageFeature::OpenMP);
+ bool isOpenMPSimd = ci.getInvocation().getLangOpts().OpenMPSimd;
fir::OpenMPFIRPassPipelineOpts opts;
@@ -328,12 +329,13 @@ bool CodeGenAction::beginSourceFileAction() {
if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
mlirModule->getOperation()))
opts.isTargetDevice = offloadMod.getIsTargetDevice();
+ }
- // WARNING: This pipeline must be run immediately after the lowering to
- // ensure that the FIR is correct with respect to OpenMP operations/
- // attributes.
+ // WARNING: This pipeline must be run immediately after the lowering to
+ // ensure that the FIR is correct with respect to OpenMP operations/
+ // attributes.
+ if (isOpenMPEnabled || isOpenMPSimd)
fir::createOpenMPFIRPassPipeline(pm, opts);
- }
pm.enableVerifier(/*verifyPasses=*/true);
pm.addPass(std::make_unique<Fortran::lower::VerifierPass>());
@@ -616,12 +618,14 @@ void CodeGenAction::lowerHLFIRToFIR() {
pm.addPass(std::make_unique<Fortran::lower::VerifierPass>());
pm.enableVerifier(/*verifyPasses=*/true);
+ fir::EnableOpenMP enableOpenMP = fir::EnableOpenMP::None;
+ if (ci.getInvocation().getFrontendOpts().features.IsEnabled(
+ Fortran::common::LanguageFeature::OpenMP))
+ enableOpenMP = fir::EnableOpenMP::Full;
+ if (ci.getInvocation().getLangOpts().OpenMPSimd)
+ enableOpenMP = fir::EnableOpenMP::Simd;
// Create the pass pipeline
- fir::createHLFIRToFIRPassPipeline(
- pm,
- ci.getInvocation().getFrontendOpts().features.IsEnabled(
- Fortran::common::LanguageFeature::OpenMP),
- level);
+ fir::createHLFIRToFIRPassPipeline(pm, enableOpenMP, level);
(void)mlir::applyPassManagerCLOptions(pm);
mlir::TimingScope timingScopeMLIRPasses = timingScopeRoot.nest(
@@ -747,6 +751,9 @@ void CodeGenAction::generateLLVMIR() {
Fortran::common::LanguageFeature::OpenMP))
config.EnableOpenMP = true;
+ if (ci.getInvocation().getLangOpts().OpenMPSimd)
+ config.EnableOpenMPSimd = true;
+
if (ci.getInvocation().getLoweringOpts().getIntegerWrapAround())
config.NSWOnLoopVarInc = false;
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index f8a1f7983b79b..3e81f759ae69c 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -208,11 +208,12 @@ void ClauseProcessor::processTODO(mlir::Location currentLocation,
if (!x)
return;
unsigned version = semaCtx.langOptions().OpenMPVersion;
- TODO(currentLocation,
- "Unhandled clause " + llvm::omp::getOpenMPClauseName(id).upper() +
- " in " +
- llvm::omp::getOpenMPDirectiveName(directive, version).upper() +
- " construct");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(currentLocation,
+ "Unhandled clause " + llvm::omp::getOpenMPClauseName(id).upper() +
+ " in " +
+ llvm::omp::getOpenMPDirectiveName(directive, version).upper() +
+ " construct");
};
for (ClauseIterator it = clauses.begin(); it != clauses.end(); ++it)
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 4c2d7badef382..1647b8f516e46 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2259,7 +2259,8 @@ genOrderedOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
mlir::Location loc, const ConstructQueue &queue,
ConstructQueue::const_iterator item) {
- TODO(loc, "OMPD_ordered");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(loc, "OMPD_ordered");
return nullptr;
}
@@ -2446,7 +2447,8 @@ genScopeOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
mlir::Location loc, const ConstructQueue &queue,
ConstructQueue::const_iterator item) {
- TODO(loc, "Scope construct");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(loc, "Scope construct");
return nullptr;
}
@@ -3238,7 +3240,8 @@ static mlir::omp::TaskloopOp genCompositeTaskloopSimd(
lower::pft::Evaluation &eval, mlir::Location loc,
const ConstructQueue &queue, ConstructQueue::const_iterator item) {
assert(std::distance(item, queue.end()) == 2 && "Invalid leaf constructs");
- TODO(loc, "Composite TASKLOOP SIMD");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(loc, "Composite TASKLOOP SIMD");
return nullptr;
}
@@ -3410,8 +3413,10 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
break;
case llvm::omp::Directive::OMPD_tile: {
unsigned version = semaCtx.langOptions().OpenMPVersion;
- TODO(loc, "Unhandled loop directive (" +
- llvm::omp::getOpenMPDirectiveName(dir, version) + ")");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(loc, "Unhandled loop directive (" +
+ llvm::omp::getOpenMPDirectiveName(dir, version) + ")");
+ break;
}
case llvm::omp::Directive::OMPD_unroll:
genUnrollOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item);
@@ -3446,35 +3451,40 @@ static void
genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
const parser::OpenMPDeclarativeAllocate &declarativeAllocate) {
- TODO(converter.getCurrentLocation(), "OpenMPDeclarativeAllocate");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(converter.getCurrentLocation(), "OpenMPDeclarativeAllocate");
}
static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval,
const parser::OpenMPDeclarativeAssumes &assumesConstruct) {
- TODO(converter.getCurrentLocation(), "OpenMP ASSUMES declaration");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(converter.getCurrentLocation(), "OpenMP ASSUMES declaration");
}
static void
genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
const parser::OmpDeclareVariantDirective &declareVariantDirective) {
- TODO(converter.getCurrentLocation(), "OmpDeclareVariantDirective");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(converter.getCurrentLocation(), "OmpDeclareVariantDirective");
}
static void genOMP(
lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
const parser::OpenMPDeclareReductionConstruct &declareReductionConstruct) {
- TODO(converter.getCurrentLocation(), "OpenMPDeclareReductionConstruct");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(converter.getCurrentLocation(), "OpenMPDeclareReductionConstruct");
}
static void
genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
const parser::OpenMPDeclareSimdConstruct &declareSimdConstruct) {
- TODO(converter.getCurrentLocation(), "OpenMPDeclareSimdConstruct");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(converter.getCurrentLocation(), "OpenMPDeclareSimdConstruct");
}
static void
@@ -3670,14 +3680,16 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
(void)objects;
(void)clauses;
- TODO(converter.getCurrentLocation(), "OpenMPDepobjConstruct");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(converter.getCurrentLocation(), "OpenMPDepobjConstruct");
}
static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval,
const parser::OpenMPInteropConstruct &interopConstruct) {
- TODO(converter.getCurrentLocation(), "OpenMPInteropConstruct");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(converter.getCurrentLocation(), "OpenMPInteropConstruct");
}
static void
@@ -3693,7 +3705,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval,
const parser::OpenMPAllocatorsConstruct &allocsConstruct) {
- TODO(converter.getCurrentLocation(), "OpenMPAllocatorsConstruct");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(converter.getCurrentLocation(), "OpenMPAllocatorsConstruct");
}
//===----------------------------------------------------------------------===//
@@ -3765,7 +3778,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
!std::holds_alternative<clause::Detach>(clause.u)) {
std::string name =
parser::ToUpperCaseLetters(llvm::omp::getOpenMPClauseName(clause.id));
- TODO(clauseLocation, name + " clause is not implemented yet");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(clauseLocation, name + " clause is not implemented yet");
}
}
@@ -3785,7 +3799,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
lower::pft::Evaluation &eval,
const parser::OpenMPAssumeConstruct &assumeConstruct) {
mlir::Location clauseLocation = converter.genLocation(assumeConstruct.source);
- TODO(clauseLocation, "OpenMP ASSUME construct");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(clauseLocation, "OpenMP ASSUME construct");
}
static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
@@ -3810,21 +3825,24 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval,
const parser::OpenMPUtilityConstruct &) {
- TODO(converter.getCurrentLocation(), "OpenMPUtilityConstruct");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(converter.getCurrentLocation(), "OpenMPUtilityConstruct");
}
static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval,
const parser::OpenMPDispatchConstruct &) {
- TODO(converter.getCurrentLocation(), "OpenMPDispatchConstruct");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(converter.getCurrentLocation(), "OpenMPDispatchConstruct");
}
static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval,
const parser::OpenMPExecutableAllocate &execAllocConstruct) {
- TODO(converter.getCurrentLocation(), "OpenMPExecutableAllocate");
+ if (!semaCtx.langOptions().OpenMPSimd)
+ TODO(converter.getCurrentLocation(), "OpenMPExecutableAllocate");
}
static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
index e31543328a9f9..3fb0bac05ce0d 100644
--- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt
+++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt
@@ -9,6 +9,7 @@ add_flang_library(FlangOpenMPTransforms
MarkDeclareTarget.cpp
LowerWorkshare.cpp
LowerNontemporal.cpp
+ SimdOnly.cpp
DEPENDS
FIRDialect
diff --git a/flang/lib/Optimizer/OpenMP/SimdOnly.cpp b/flang/lib/Optimizer/OpenMP/SimdOnly.cpp
new file mode 100644
index 0000000000000..b4c97df767e65
--- /dev/null
+++ b/flang/lib/Optimizer/OpenMP/SimdOnly.cpp
@@ -0,0 +1,360 @@
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Transforms/Utils.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include <llvm/Support/Debug.h>
+#include <mlir/IR/MLIRContext.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/PatternMatch.h>
+#include <mlir/Support/LLVM.h>
+
+namespace flangomp {
+#define GEN_PASS_DEF_SIMDONLYPASS
+#include "flang/Optimizer/OpenMP/Passes.h.inc"
+} // namespace flangomp
+
+namespace {
+
+#define DEBUG_TYPE "omp-simd-only-pass"
+
+class SimdOnlyConversionPattern : public mlir::RewritePattern {
+public:
+ SimdOnlyConversionPattern(mlir::MLIRContext *ctx)
+ : mlir::RewritePattern(MatchAnyOpTypeTag{}, 1, ctx) {}
+
+ mlir::LogicalResult
+ matchAndRewrite(mlir::Operation *op,
+ mlir::PatternRewriter &rewriter) const override {
+ if (op->getDialect()->getNamespace() !=
+ mlir::omp::OpenMPDialect::getDialectNamespace())
+ return rewriter.notifyMatchFailure(op, "Not an OpenMP op");
+
+ if (auto simdOp = mlir::dyn_cast<mlir::omp::SimdOp>(op)) {
+ // Remove the composite attr given that the op will no longer be composite
+ if (simdOp.isComposite()) {
+ simdOp.setComposite(false);
+ return mlir::success();
+ }
+
+ return rewriter.notifyMatchFailure(op, "Op is a plain SimdOp");
+ }
+
+ if (op->getParentOfType<mlir::omp::SimdOp>())
+ return rewriter.notifyMatchFailure(op, "Op is nested under a SimdOp");
+
+ if (!mlir::isa<mlir::func::FuncOp>(op->getParentOp()) &&
+ (mlir::isa<mlir::omp::TerminatorOp>(op) ||
+ mlir::isa<mlir::omp::YieldOp>(op)))
+ return rewriter.notifyMatchFailure(op,
+ "Non top-level yield or terminator");
+
+ // SectionOp overrides its BlockArgInterface based on the parent SectionsOp.
+ // We need to make sure we only rewrite omp.sections once all omp.section
+ // ops inside it have been rewritten, otherwise the individual omp.section
+ // ops will not be able to access their argument values.
+ if (auto sectionsOp = mlir::dyn_cast<mlir::omp::SectionsOp>(op)) {
+ for (auto &opInSections : sectionsOp.getRegion().getOps())
+ if (mlir::isa<mlir::omp::SectionOp>(opInSections))
+ return rewriter.notifyMatchFailure(
+ op, "SectionsOp still contains individual sections");
+ }
+
+ LLVM_DEBUG(llvm::dbgs() << "SimdOnlyPass matched OpenMP op:\n");
+ LLVM_DEBUG(op->dump());
+
+ // Erase ops that don't need any special handling
+ if (mlir::isa<mlir::omp::BarrierOp>(op) ||
+ mlir::isa<mlir::omp::FlushOp>(op) ||
+ mlir::isa<mlir::omp::TaskyieldOp>(op) ||
+ mlir::isa<mlir::omp::MapBoundsOp>(op) ||
+ mlir::isa<mlir::omp::TargetEnterDataOp>(op) ||
+ mlir::isa<mlir::omp::TargetExitDataOp>(op) ||
+ mlir::isa<mlir::omp::TargetUpdateOp>(op) ||
+ mlir::isa<mlir::omp::OrderedOp>(op) ||
+ mlir::isa<mlir::omp::CancelOp>(op) ||
+ mlir::isa<mlir::omp::CancellationPointOp>(op) ||
+ mlir::isa<mlir::omp::ScanOp>(op) ||
+ mlir::isa<mlir::omp::TaskwaitOp>(op)) {
+ rewriter.eraseOp(op);
+ return mlir::success();
+ }
+
+ fir::FirOpBuilder builder(rewriter, op);
+ mlir::Location loc = op->getLoc();
+
+ auto inlineSimpleOp = [&](mlir::Operation *ompOp) -> bool {
+ if (!ompOp)
+ return false;
+
+ llvm::SmallVector<std::pair<mlir::Value, mlir::BlockArgument>>
+ blockArgsPairs;
+ if (auto iface =
+ mlir::dyn_cast<mlir::omp::BlockArgOpenMPOpInterface>(op)) {
+ iface.getBlockArgsPairs(blockArgsPairs);
+ for (auto [value, argument] : blockArgsPairs)
+ rewriter.replaceAllUsesWith(argument, value);
+ }
+
+ if (ompOp->getRegion(0).getBlocks().size() == 1) {
+ auto &block = *ompOp->getRegion(0).getBlocks().begin();
+ // This block is about to be removed so any arguments should have been
+ // replaced by now.
+ block.eraseArguments(0, block.getNumArguments());
+ if (auto terminatorOp =
+ mlir::dyn_cast<mlir::omp::TerminatorOp>(block.back())) {
+ rewriter.eraseOp(terminatorOp);
+ }
+ rewriter.inlineBlockBefore(&block, op, {});
+ } else {
+ // When dealing with multi-block regions we need to fix up the control
+ // flow
+ auto *origBlock = ompOp->getBlock();
+ auto *newBlock = rewriter.splitBlock(origBlock, ompOp->getIterator());
+ auto *innerFrontBlock = &ompOp->getRegion(0).getBlocks().front();
+ builder.setInsertionPointToEnd(origBlock);
+ builder.create<mlir::cf::BranchOp>(loc, innerFrontBlock);
+ // We are no longer passing any arguments to the first block in the
+ // region, so this should be safe to erase.
+ innerFrontBlock->eraseArguments(0, innerFrontBlock->getNumArguments());
+
+ for (auto &innerBlock : ompOp->getRegion(0).getBlocks()) {
+ // Remove now-unused block arguments
+ for (auto arg : innerBlock.getArguments()) {
+ if (arg.getUses().empty())
+ innerBlock.eraseArgument(arg.getArgNumber());
+ }
+ if (auto terminatorOp =
+ mlir::dyn_cast<mlir::omp::TerminatorOp>(innerBlock.back())) {
+ builder.setInsertionPointToEnd(&innerBlock);
+ builder.create<mlir::cf::BranchOp>(loc, newBlock);
+ rewriter.eraseOp(terminatorOp);
+ }
+ }
+
+ rewriter.inlineRegionBefore(ompOp->getRegion(0), newBlock);
+ }
+
+ rewriter.eraseOp(op);
+ return true;
+ };
+
+ if (auto ompOp = mlir::dyn_cast<mlir::omp::LoopNestOp>(op)) {
+ mlir::Type indexType = builder.getIndexType();
+ mlir::Type oldIndexType = ompOp.getIVs().begin()->getType();
+ builder.setInsertionPoint(op);
+ auto one = builder.create<mlir::arith::ConstantIndexOp>(loc, 1);
+
+ // Generate the new loop nest
+ mlir::Block *nestBody = nullptr;
+ fir::DoLoopOp outerLoop = nullptr;
+ llvm::SmallVector<mlir::Value> loopIndArgs;
+ for (auto extent : ompOp.getLoopUpperBounds()) {
+ auto ub = builder.createConvert(loc, indexType, extent);
+ auto doLoop = builder.create<fir::DoLoopOp>(loc, one, ub, one, false);
+ nestBody = doLoop.getBody();
+ builder.setInsertionPointToStart(nestBody);
+ // Convert the indices to the type used inside the loop if needed
+ if (oldIndexType != indexType) {
+ auto convertedIndVar = builder.createConvert(
+ loc, oldIndexType, doLoop.getInductionVar());
+ loopIndArgs.push_back(convertedIndVar);
+ } else {
+ loopIndArgs.push_back(doLoop.getInductionVar());
+ }
+ if (!outerLoop)
+ outerLoop = doLoop;
+ }
+
+ // Move the omp loop body into the new loop body
+ if (ompOp->getRegion(0).getBlocks().size() == 1) {
+ auto &block = *ompOp->getRegion(0).getBlocks().begin();
+ rewriter.mergeBlocks(&block, nestBody, loopIndArgs);
+
+ // Find the new loop block terminator and move it before the end of the
+ // block
+ for (auto &loopBodyOp : nestBody->getOperations()) {
+ if (auto resultOp = mlir::dyn_cast<fir::ResultOp>(loopBodyOp)) {
+ rewriter.moveOpBefore(resultOp.getOperation(), &nestBody->back());
+ break;
+ }
+ }
+
+ // Remove omp.yield at the end of the loop body
+ if (auto yieldOp = mlir::dyn_cast<mlir::omp::YieldOp>(nestBody->back()))
+ rewriter.eraseOp(yieldOp);
+ // DoLoopOp does not support multi-block regions, thus if we're dealing
+ // with multiple blocks we need to convert it into basic control-flow
+ // operations.
+ } else {
+ rewriter.inlineRegionBefore(ompOp->getRegion(0), nestBody);
+ auto indVarArg = outerLoop->getRegion(0).front().getArgument(0);
+ // fir::convertDoLoopToCFG expects the induction variable to be of type
+ // index while the OpenMP LoopNestOp can have indices of different
+ // types. We need to work around it.
+ if (indVarArg.getType() != indexType)
+ indVarArg.setType(indexType);
+
+ auto loopBlocks =
+ fir::convertDoLoopToCFG(outerLoop, rewriter, false, false);
+ auto *conditionalBlock = loopBlocks.first;
+ auto *firstBlock =
+ conditionalBlock->getNextNode(); // Start of the loop body
+ auto *lastBlock = loopBlocks.second; // Incrementing induction variables
+
+ // If the induction variable is used within the loop and was originally
+ // not of type index, then we need to add a convert to the original type
+ // and replace its uses inside the loop body.
+ if (oldIndexType != indexType) {
+ indVarArg = conditionalBlock->getArgument(0);
+ builder.setInsertionPointToStart(firstBlock);
+ auto convertedIndVar =
+ builder.createConvert(loc, oldIndexType, indVarArg);
+ rewriter.replaceUsesWithIf(
+ indVarArg, convertedIndVar, [&](auto &use) -> bool {
+ return use.getOwner() != convertedIndVar.getDefiningOp() &&
+ use.getOwner()->getBlock() != lastBlock;
+ });
+ }
+
+ // There might be an unused convert and an unused argument to the block.
+ // If so, remove them.
+ if (lastBlock->front().getUses().empty())
+ lastBlock->front().erase();
+ for (auto arg : lastBlock->getArguments()) {
+ if (arg.getUses().empty())
+ lastBlock->eraseArgument(arg.getArgNumber());
+ }
+
+ // Any loop blocks that end in omp.yield should just branch to
+ // lastBlock.
+ for (auto *loopBlock = conditionalBlock; loopBlock != lastBlock;
+ loopBlock = loopBlock->getNextNode()) {
+ if (auto yieldOp =
+ mlir::dyn_cast<mlir::omp::YieldOp>(loopBlock->back())) {
+ builder.setInsertionPointToEnd(loopBlock);
+ builder.create<mlir::cf::BranchOp>(loc, lastBlock);
+ rewriter.eraseOp(yieldOp);
+ }
+ }
+ }
+
+ rewriter.eraseOp(ompOp);
+ return mlir::success();
+ }
+
+ if (auto mapInfoOp = mlir::dyn_cast<mlir::omp::MapInfoOp>(op)) {
+ mapInfoOp.getResult().replaceAllUsesWith(mapInfoOp.getVarPtr());
+ rewriter.eraseOp(mapInfoOp);
+ return mlir::success();
+ }
+
+ if (auto atomicReadOp = mlir::dyn_cast<mlir::omp::AtomicReadOp>(op)) {
+ builder.setInsertionPoint(op);
+ auto loadOp = builder.create<fir::LoadOp>(loc, atomicReadOp.getX());
+ auto storeOp = builder.create<fir::StoreOp>(loc, loadOp.getResult(),
+ atomicReadOp.getV());
+ rewriter.replaceOp(op, storeOp);
+ return mlir::success();
+ }
+
+ if (auto atomicWriteOp = mlir::dyn_cast<mlir::omp::AtomicWriteOp>(op)) {
+ auto storeOp = builder.create<fir::StoreOp>(loc, atomicWriteOp.getExpr(),
+ atomicWriteOp.getX());
+ rewriter.replaceOp(op, storeOp);
+ return mlir::success();
+ }
+
+ if (auto atomicUpdateOp = mlir::dyn_cast<mlir::omp::AtomicUpdateOp>(op)) {
+ assert("one block in region" &&
+ atomicUpdateOp.getRegion().getBlocks().size() == 1);
+ auto &block = *atomicUpdateOp.getRegion().getBlocks().begin();
+ builder.setInsertionPointToStart(&block);
+
+ // Load the update `x` operand and replace its uses within the block
+ auto loadOp = builder.create<fir::LoadOp>(loc, atomicUpdateOp.getX());
+ rewriter.replaceUsesWithIf(
+ block.getArgument(0), loadOp.getResult(),
+ [&](auto &op) { return op.get().getParentBlock() == █ });
+
+ // Store the result back into `x` in line with omp.yield semantics for
+ // this op
+ auto yieldOp = mlir::cast<mlir::omp::YieldOp>(block.back());
+ assert("only one yield operand" && yieldOp->getNumOperands() == 1);
+ builder.setInsertionPointAfter(yieldOp);
+ builder.create<fir::StoreOp>(loc, yieldOp->getOperand(0),
+ atomicUpdateOp.getX());
+ rewriter.eraseOp(yieldOp);
+
+ // Inline the final block and remove the now-empty op
+ assert("only one block argument" && block.getNumArguments() == 1);
+ block.eraseArguments(0, block.getNumArguments());
+ rewriter.inlineBlockBefore(&block, op, {});
+ rewriter.eraseOp(op);
+ return mlir::success();
+ }
+
+ if (auto threadPrivateOp = mlir::dyn_cast<mlir::omp::ThreadprivateOp>(op)) {
+ threadPrivateOp.getTlsAddr().replaceAllUsesWith(
+ threadPrivateOp.getSymAddr());
+ rewriter.eraseOp(threadPrivateOp);
+ return mlir::success();
+ }
+
+ if (inlineSimpleOp(mlir::dyn_cast<mlir::omp::TeamsOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::ParallelOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::SingleOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::SectionOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::SectionsOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::WsloopOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::LoopOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::TargetOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::TargetDataOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::DistributeOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::TaskOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::TaskloopOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::MasterOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::CriticalOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::OrderedRegionOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::AtomicCaptureOp>(op)) ||
+ inlineSimpleOp(mlir::dyn_cast<mlir::omp::MaskedOp>(op)))
+ return mlir::success();
+
+ op->emitOpError("OpenMP operation left unhandled after SimdOnly pass.");
+ return mlir::failure();
+ }
+};
+
+class SimdOnlyPass : public flangomp::impl::SimdOnlyPassBase<SimdOnlyPass> {
+
+public:
+ SimdOnlyPass() = default;
+
+ void runOnOperation() override {
+ mlir::func::FuncOp func = getOperation();
+
+ if (func.isDeclaration())
+ return;
+
+ mlir::MLIRContext *context = &getContext();
+ mlir::RewritePatternSet patterns(context);
+ patterns.insert<SimdOnlyConversionPattern>(context);
+
+ mlir::GreedyRewriteConfig config;
+ // Prevent the pattern driver from merging blocks.
+ config.setRegionSimplificationLevel(
+ mlir::GreedySimplifyRegionLevel::Disabled);
+
+ if (mlir::failed(
+ mlir::applyPatternsGreedily(func, std::move(patterns), config))) {
+ mlir::emitError(func.getLoc(), "error in simd-only conversion pass");
+ signalPassFailure();
+ }
+ }
+};
+
+} // namespace
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index ca8e820608688..5a870928f8413 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -242,7 +242,8 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm,
/// \param pm - MLIR pass manager that will hold the pipeline definition
/// \param optLevel - optimization level used for creating FIR optimization
/// passes pipeline
-void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP,
+void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
+ EnableOpenMP enableOpenMP,
llvm::OptimizationLevel optLevel) {
if (optLevel.isOptimizingForSpeed()) {
addCanonicalizerPassWithoutRegionSimplification(pm);
@@ -294,8 +295,10 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP,
addNestedPassToAllTopLevelOperations<PassConstructor>(
pm, hlfir::createInlineHLFIRAssign);
pm.addPass(hlfir::createConvertHLFIRtoFIR());
- if (enableOpenMP)
+ if (enableOpenMP != EnableOpenMP::None)
pm.addPass(flangomp::createLowerWorkshare());
+ if (enableOpenMP == EnableOpenMP::Simd)
+ pm.addPass(flangomp::createSimdOnlyPass());
}
/// Create a pass pipeline for handling certain OpenMP transformations needed
@@ -396,7 +399,12 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm,
void createMLIRToLLVMPassPipeline(mlir::PassManager &pm,
MLIRToLLVMPassPipelineConfig &config,
llvm::StringRef inputFilename) {
- fir::createHLFIRToFIRPassPipeline(pm, config.EnableOpenMP, config.OptLevel);
+ fir::EnableOpenMP enableOpenMP = fir::EnableOpenMP::None;
+ if (config.EnableOpenMP)
+ enableOpenMP = fir::EnableOpenMP::Full;
+ if (config.EnableOpenMPSimd)
+ enableOpenMP = fir::EnableOpenMP::Simd;
+ fir::createHLFIRToFIRPassPipeline(pm, enableOpenMP, config.OptLevel);
// Add default optimizer pass pipeline.
fir::createDefaultFIROptimizerPassPipeline(pm, config);
diff --git a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp
index e466aed753e63..4bcf7d857c7b0 100644
--- a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp
+++ b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp
@@ -14,6 +14,7 @@
#include "flang/Optimizer/Support/InternalNames.h"
#include "flang/Optimizer/Support/TypeCode.h"
#include "flang/Optimizer/Transforms/Passes.h"
+#include "flang/Optimizer/Transforms/Utils.h"
#include "flang/Runtime/derived-api.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
@@ -31,6 +32,113 @@ namespace fir {
using namespace fir;
using namespace mlir;
+// Extracted here for use in other passes
+
+/// Convert fir::DoLoopOp to control-flow operations
+std::pair<mlir::Block *, mlir::Block *>
+fir::convertDoLoopToCFG(DoLoopOp loop, mlir::PatternRewriter &rewriter,
+ bool setNSW, bool forceLoopToExecuteOnce) {
+ auto loc = loop.getLoc();
+ mlir::arith::IntegerOverflowFlags flags{};
+ if (setNSW)
+ flags = bitEnumSet(flags, mlir::arith::IntegerOverflowFlags::nsw);
+ auto iofAttr =
+ mlir::arith::IntegerOverflowFlagsAttr::get(rewriter.getContext(), flags);
+
+ // Create the start and end blocks that will wrap the DoLoopOp with an
+  // initializer and an end point
+ auto *initBlock = rewriter.getInsertionBlock();
+ auto initPos = rewriter.getInsertionPoint();
+ auto *endBlock = rewriter.splitBlock(initBlock, initPos);
+
+ // Split the first DoLoopOp block in two parts. The part before will be the
+ // conditional block since it already has the induction variable and
+ // loop-carried values as arguments.
+ auto *conditionalBlock = &loop.getRegion().front();
+ conditionalBlock->addArgument(rewriter.getIndexType(), loc);
+ auto *firstBlock =
+ rewriter.splitBlock(conditionalBlock, conditionalBlock->begin());
+ auto *lastBlock = &loop.getRegion().back();
+
+ // Move the blocks from the DoLoopOp between initBlock and endBlock
+ rewriter.inlineRegionBefore(loop.getRegion(), endBlock);
+
+ // Get loop values from the DoLoopOp
+ auto low = loop.getLowerBound();
+ auto high = loop.getUpperBound();
+ assert(low && high && "must be a Value");
+ auto step = loop.getStep();
+
+  // Initialization block
+ rewriter.setInsertionPointToEnd(initBlock);
+ auto diff = mlir::arith::SubIOp::create(rewriter, loc, high, low);
+ auto distance = mlir::arith::AddIOp::create(rewriter, loc, diff, step);
+ mlir::Value iters =
+ mlir::arith::DivSIOp::create(rewriter, loc, distance, step);
+
+ if (forceLoopToExecuteOnce) {
+ auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0);
+ auto cond = mlir::arith::CmpIOp::create(
+ rewriter, loc, arith::CmpIPredicate::sle, iters, zero);
+ auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1);
+ iters = mlir::arith::SelectOp::create(rewriter, loc, cond, one, iters);
+ }
+
+ llvm::SmallVector<mlir::Value> loopOperands;
+ loopOperands.push_back(low);
+ auto operands = loop.getIterOperands();
+ loopOperands.append(operands.begin(), operands.end());
+ loopOperands.push_back(iters);
+
+ mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, loopOperands);
+
+ // Last loop block
+ auto *terminator = lastBlock->getTerminator();
+ rewriter.setInsertionPointToEnd(lastBlock);
+ auto iv = conditionalBlock->getArgument(0);
+ mlir::Value steppedIndex =
+ mlir::arith::AddIOp::create(rewriter, loc, iv, step, iofAttr);
+ assert(steppedIndex && "must be a Value");
+ auto lastArg = conditionalBlock->getNumArguments() - 1;
+ auto itersLeft = conditionalBlock->getArgument(lastArg);
+ auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1);
+ mlir::Value itersMinusOne =
+ mlir::arith::SubIOp::create(rewriter, loc, itersLeft, one);
+
+ llvm::SmallVector<mlir::Value> loopCarried;
+ loopCarried.push_back(steppedIndex);
+ auto begin = loop.getFinalValue() ? std::next(terminator->operand_begin())
+ : terminator->operand_begin();
+ loopCarried.append(begin, terminator->operand_end());
+ loopCarried.push_back(itersMinusOne);
+ auto backEdge =
+ mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, loopCarried);
+ rewriter.eraseOp(terminator);
+
+ // Copy loop annotations from the do loop to the loop back edge.
+ if (auto ann = loop.getLoopAnnotation())
+ backEdge->setAttr("loop_annotation", *ann);
+
+ // Conditional block
+ rewriter.setInsertionPointToEnd(conditionalBlock);
+ auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0);
+ auto comparison = mlir::arith::CmpIOp::create(
+ rewriter, loc, arith::CmpIPredicate::sgt, itersLeft, zero);
+
+ mlir::cf::CondBranchOp::create(rewriter, loc, comparison, firstBlock,
+ llvm::ArrayRef<mlir::Value>(), endBlock,
+ llvm::ArrayRef<mlir::Value>());
+
+ // The result of the loop operation is the values of the condition block
+ // arguments except the induction variable on the last iteration.
+ auto args = loop.getFinalValue()
+ ? conditionalBlock->getArguments()
+ : conditionalBlock->getArguments().drop_front();
+ rewriter.replaceOp(loop, args.drop_back());
+
+ return std::make_pair(conditionalBlock, lastBlock);
+}
+
namespace {
// Conversion of fir control ops to more primitive control-flow.
@@ -50,103 +158,7 @@ class CfgLoopConv : public mlir::OpRewritePattern<fir::DoLoopOp> {
llvm::LogicalResult
matchAndRewrite(DoLoopOp loop,
mlir::PatternRewriter &rewriter) const override {
- auto loc = loop.getLoc();
- mlir::arith::IntegerOverflowFlags flags{};
- if (setNSW)
- flags = bitEnumSet(flags, mlir::arith::IntegerOverflowFlags::nsw);
- auto iofAttr = mlir::arith::IntegerOverflowFlagsAttr::get(
- rewriter.getContext(), flags);
-
- // Create the start and end blocks that will wrap the DoLoopOp with an
- // initalizer and an end point
- auto *initBlock = rewriter.getInsertionBlock();
- auto initPos = rewriter.getInsertionPoint();
- auto *endBlock = rewriter.splitBlock(initBlock, initPos);
-
- // Split the first DoLoopOp block in two parts. The part before will be the
- // conditional block since it already has the induction variable and
- // loop-carried values as arguments.
- auto *conditionalBlock = &loop.getRegion().front();
- conditionalBlock->addArgument(rewriter.getIndexType(), loc);
- auto *firstBlock =
- rewriter.splitBlock(conditionalBlock, conditionalBlock->begin());
- auto *lastBlock = &loop.getRegion().back();
-
- // Move the blocks from the DoLoopOp between initBlock and endBlock
- rewriter.inlineRegionBefore(loop.getRegion(), endBlock);
-
- // Get loop values from the DoLoopOp
- auto low = loop.getLowerBound();
- auto high = loop.getUpperBound();
- assert(low && high && "must be a Value");
- auto step = loop.getStep();
-
- // Initalization block
- rewriter.setInsertionPointToEnd(initBlock);
- auto diff = mlir::arith::SubIOp::create(rewriter, loc, high, low);
- auto distance = mlir::arith::AddIOp::create(rewriter, loc, diff, step);
- mlir::Value iters =
- mlir::arith::DivSIOp::create(rewriter, loc, distance, step);
-
- if (forceLoopToExecuteOnce) {
- auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0);
- auto cond = mlir::arith::CmpIOp::create(
- rewriter, loc, arith::CmpIPredicate::sle, iters, zero);
- auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1);
- iters = mlir::arith::SelectOp::create(rewriter, loc, cond, one, iters);
- }
-
- llvm::SmallVector<mlir::Value> loopOperands;
- loopOperands.push_back(low);
- auto operands = loop.getIterOperands();
- loopOperands.append(operands.begin(), operands.end());
- loopOperands.push_back(iters);
-
- mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock, loopOperands);
-
- // Last loop block
- auto *terminator = lastBlock->getTerminator();
- rewriter.setInsertionPointToEnd(lastBlock);
- auto iv = conditionalBlock->getArgument(0);
- mlir::Value steppedIndex =
- mlir::arith::AddIOp::create(rewriter, loc, iv, step, iofAttr);
- assert(steppedIndex && "must be a Value");
- auto lastArg = conditionalBlock->getNumArguments() - 1;
- auto itersLeft = conditionalBlock->getArgument(lastArg);
- auto one = mlir::arith::ConstantIndexOp::create(rewriter, loc, 1);
- mlir::Value itersMinusOne =
- mlir::arith::SubIOp::create(rewriter, loc, itersLeft, one);
-
- llvm::SmallVector<mlir::Value> loopCarried;
- loopCarried.push_back(steppedIndex);
- auto begin = loop.getFinalValue() ? std::next(terminator->operand_begin())
- : terminator->operand_begin();
- loopCarried.append(begin, terminator->operand_end());
- loopCarried.push_back(itersMinusOne);
- auto backEdge = mlir::cf::BranchOp::create(rewriter, loc, conditionalBlock,
- loopCarried);
- rewriter.eraseOp(terminator);
-
- // Copy loop annotations from the do loop to the loop back edge.
- if (auto ann = loop.getLoopAnnotation())
- backEdge->setAttr("loop_annotation", *ann);
-
- // Conditional block
- rewriter.setInsertionPointToEnd(conditionalBlock);
- auto zero = mlir::arith::ConstantIndexOp::create(rewriter, loc, 0);
- auto comparison = mlir::arith::CmpIOp::create(
- rewriter, loc, arith::CmpIPredicate::sgt, itersLeft, zero);
-
- mlir::cf::CondBranchOp::create(rewriter, loc, comparison, firstBlock,
- llvm::ArrayRef<mlir::Value>(), endBlock,
- llvm::ArrayRef<mlir::Value>());
-
- // The result of the loop operation is the values of the condition block
- // arguments except the induction variable on the last iteration.
- auto args = loop.getFinalValue()
- ? conditionalBlock->getArguments()
- : conditionalBlock->getArguments().drop_front();
- rewriter.replaceOp(loop, args.drop_back());
+ convertDoLoopToCFG(loop, rewriter, setNSW, forceLoopToExecuteOnce);
return success();
}
diff --git a/flang/test/Driver/fopenmp-simd.f90 b/flang/test/Driver/fopenmp-simd.f90
new file mode 100644
index 0000000000000..b25adee2779ee
--- /dev/null
+++ b/flang/test/Driver/fopenmp-simd.f90
@@ -0,0 +1,59 @@
+! RUN: %flang -target x86_64-linux-gnu -fopenmp-simd %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-OPENMP-SIMD-FLAG --check-prefix=CHECK-NO-LD-ANY
+! RUN: %flang -target x86_64-darwin -fopenmp-simd %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-OPENMP-SIMD-FLAG --check-prefix=CHECK-NO-LD-ANY
+! RUN: %flang -target x86_64-freebsd -fopenmp-simd %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-OPENMP-SIMD-FLAG --check-prefix=CHECK-NO-LD-ANY
+! RUN: %flang -target x86_64-windows-gnu -fopenmp-simd %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-OPENMP-SIMD-FLAG --check-prefix=CHECK-NO-LD-ANY
+
+! CHECK-OPENMP-SIMD-FLAG: "-fopenmp-simd"
+! CHECK-NO-LD-ANY-NOT: "-l{{(omp|gomp|iomp5)}}"
+
+! -fopenmp-simd enables openmp support only for simd constructs
+! RUN: %flang_fc1 -fopenmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP-SIMD %s
+! RUN: %flang_fc1 -fno-openmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-NO-OMP-SIMD %s
+! RUN: %flang_fc1 -fopenmp-simd -fno-openmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-NO-OMP-SIMD %s
+! RUN: %flang_fc1 -fno-openmp-simd -fopenmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP-SIMD %s
+! -fopenmp-simd should have no effect if -fopenmp is already set
+! RUN: %flang_fc1 -fopenmp %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP %s
+! RUN: %flang_fc1 -fopenmp -fopenmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP %s
+! RUN: %flang_fc1 -fopenmp -fno-openmp-simd %s -emit-fir -o - | FileCheck --check-prefix=CHECK-OMP %s
+
+subroutine main
+ ! CHECK-OMP-SIMD-NOT: omp.parallel
+ ! CHECK-OMP-SIMD-NOT: omp.wsloop
+ ! CHECK-OMP-SIMD-NOT: omp.loop_nest
+ ! CHECK-OMP-SIMD: fir.do_loop
+ ! CHECK-NO-OMP-SIMD-NOT: omp.parallel
+ ! CHECK-NO-OMP-SIMD-NOT: omp.wsloop
+ ! CHECK-NO-OMP-SIMD-NOT: omp.loop_nest
+ ! CHECK-NO-OMP-SIMD: fir.do_loop
+ ! CHECK-OMP: omp.parallel
+ ! CHECK-OMP: omp.wsloop
+ ! CHECK-OMP: omp.loop_nest
+ ! CHECK-OMP-NOT: fir.do_loop
+ !$omp parallel do
+ do i = 1, 10
+ print *, "test"
+ end do
+ ! CHECK-NO-OMP-SIMD-NOT: omp.yield
+ ! CHECK-NO-OMP-SIMD-NOT: omp.terminator
+ ! CHECK-OMP-SIMD-NOT: omp.yield
+ ! CHECK-OMP-SIMD-NOT: omp.terminator
+ ! CHECK-OMP: omp.yield
+ ! CHECK-OMP: omp.terminator
+ !$omp end parallel do
+
+ ! CHECK-OMP-SIMD: omp.simd
+ ! CHECK-NO-OMP-SIMD-NOT: omp.simd
+ ! CHECK-OMP: omp.simd
+ !$omp simd
+ ! CHECK-OMP-SIMD: omp.loop_nest
+ ! CHECK-NO-OMP-SIMD-NOT: omp.loop_nest
+ ! CHECK-NO-OMP-SIMD: fir.do_loop
+ ! CHECK-OMP: omp.loop_nest
+ ! CHECK-OMP-NOT: fir.do_loop
+ do i = 1, 10
+ print *, "test"
+ ! CHECK-OMP-SIMD: omp.yield
+ ! CHECK-NO-OMP-SIMD-NOT: omp.yield
+ ! CHECK-OMP: omp.yield
+ end do
+end subroutine
diff --git a/flang/test/Transforms/OpenMP/simd-only.mlir b/flang/test/Transforms/OpenMP/simd-only.mlir
new file mode 100644
index 0000000000000..c3efce13c4414
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/simd-only.mlir
@@ -0,0 +1,622 @@
+// RUN: fir-opt --split-input-file --omp-simd-only %s | FileCheck %s
+
+// Check that simd operations (and ops nested inside them) are kept intact, while all the other OpenMP ops are removed or rewritten.
+
+// CHECK-LABEL: func.func @simd
+omp.private {type = private} @_QFEi_private_i32 : i32
+func.func @simd(%arg0: i32, %arg1: !fir.ref<i32>, %arg2: !fir.ref<i32>) {
+ %c1_i32 = arith.constant 1 : i32
+ %c100000_i32 = arith.constant 100000 : i32
+ // CHECK: omp.simd private
+ omp.simd private(@_QFEi_private_i32 %arg2 -> %arg3 : !fir.ref<i32>) {
+ // CHECK: omp.loop_nest
+ omp.loop_nest (%arg4) : i32 = (%c1_i32) to (%c100000_i32) inclusive step (%c1_i32) {
+ // CHECK: fir.store
+ fir.store %arg0 to %arg1 : !fir.ref<i32>
+ // CHECK: omp.yield
+ omp.yield
+ }
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @simd_composite
+func.func @simd_composite(%arg0: i32, %arg1: !fir.ref<i32>) {
+ %c1_i32 = arith.constant 1 : i32
+ %c100000_i32 = arith.constant 100000 : i32
+ // CHECK-NOT: omp.parallel
+ omp.parallel {
+ // CHECK-NOT: omp.wsloop
+ omp.wsloop {
+ // CHECK: omp.simd
+ omp.simd {
+ // CHECK: omp.loop_nest
+ omp.loop_nest (%arg3) : i32 = (%c1_i32) to (%c100000_i32) inclusive step (%c1_i32) {
+ // CHECK: fir.store
+ fir.store %arg0 to %arg1 : !fir.ref<i32>
+ // CHECK: omp.yield
+ omp.yield
+ }
+ // CHECK-NOT: {omp.composite}
+ } {omp.composite}
+ } {omp.composite}
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @parallel
+omp.private {type = private} @_QFEi_private_i32 : i32
+func.func @parallel(%arg0: i32, %arg1: !fir.ref<i32>) {
+ %c1 = arith.constant 1 : index
+ %c1_i32 = arith.constant 1 : i32
+ %c100000_i32 = arith.constant 100000 : i32
+ // CHECK-NOT: omp.parallel
+ omp.parallel private(@_QFEi_private_i32 %arg1 -> %arg3 : !fir.ref<i32>) {
+ // CHECK: fir.convert
+ %15 = fir.convert %c1_i32 : (i32) -> index
+ // CHECK: fir.convert
+ %16 = fir.convert %c100000_i32 : (i32) -> index
+ // CHECK: fir.do_loop
+ %18:2 = fir.do_loop %arg4 = %15 to %16 step %c1 iter_args(%arg2 = %arg0) -> (index, i32) {
+ // CHECK: fir.store
+ fir.store %arg0 to %arg1 : !fir.ref<i32>
+ // CHECK-NOT: omp.barrier
+ omp.barrier
+ fir.result %arg4, %arg2 : index, i32
+ }
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @do
+func.func @do(%arg5: i32, %arg6: !fir.ref<i32>) {
+ // CHECK: %[[C1:.*]] = arith.constant 1 : index
+ %c1_i32 = arith.constant 1 : i32
+ // CHECK: %[[C100:.*]] = fir.convert %c100_i32 : (i32) -> index
+ %c100_i32 = arith.constant 100 : i32
+ // CHECK-NOT: omp.wsloop
+ omp.wsloop {
+ // CHECK-NOT: omp.loop_nest
+ // CHECK: fir.do_loop %[[IVAR:.*]] = %[[C1]] to %[[C100]] step %[[C1]]
+ omp.loop_nest (%arg1) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32) {
+ // CHECK: fir.store
+ fir.store %arg5 to %arg6 : !fir.ref<i32>
+ // CHECK-NOT: omp.yield
+ omp.yield
+ }
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @do_nested
+func.func @do_nested(%arg5: i32, %arg6: !fir.ref<i32>) {
+ // CHECK: %[[C1:.*]] = arith.constant 1 : index
+ %c1_i32 = arith.constant 1 : i32
+ %c100_i32 = arith.constant 100 : i32
+ %c200_i32 = arith.constant 200 : i32
+ // CHECK-NOT: omp.wsloop
+ omp.wsloop {
+ // CHECK: %[[C200:.*]] = fir.convert %c200_i32 : (i32) -> index
+ // CHECK-NOT: omp.loop_nest
+ // CHECK: fir.do_loop %[[IVAR_1:.*]] = %[[C1]] to %[[C200]] step %[[C1]]
+ // CHECK: %[[C100:.*]] = fir.convert %c100_i32 : (i32) -> index
+ // CHECK: fir.do_loop %[[IVAR_2:.*]] = %[[C1]] to %[[C100]] step %[[C1]]
+ omp.loop_nest (%arg2, %arg3) : i32 = (%c1_i32, %c1_i32) to (%c200_i32, %c100_i32) inclusive step (%c1_i32, %c1_i32) {
+ // CHECK: fir.store
+ fir.store %arg5 to %arg6 : !fir.ref<i32>
+ // CHECK-NOT: omp.yield
+ omp.yield
+ }
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @single
+func.func @single(%arg0: i32, %arg1: !fir.ref<i32>) {
+ // CHECK-NOT: omp.single
+ omp.single {
+ // CHECK: fir.store
+ fir.store %arg0 to %arg1 : !fir.ref<i32>
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @target_map(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>
+func.func @target_map(%arg5: i32, %arg6: !fir.ref<i32>) {
+ // CHECK-NOT: omp.map.info
+ %3 = omp.map.info var_ptr(%arg6 : !fir.ref<i32>, i32) map_clauses(implicit) capture(ByCopy) -> !fir.ref<i32>
+ // CHECK-NOT: omp.target
+ omp.target map_entries(%3 -> %arg0 : !fir.ref<i32>) {
+ // CHECK: arith.constant
+ %c1_i32 = arith.constant 1 : i32
+ // CHECK: fir.store %c1_i32 to %[[ARG_1]]
+ fir.store %c1_i32 to %arg0 : !fir.ref<i32>
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @task(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>
+omp.private {type = private} @_QFEi_private_i32 : i32
+func.func @task(%arg5: i32, %arg6: !fir.ref<i32>) {
+ // CHECK-NOT: omp.task
+ omp.task private(@_QFEi_private_i32 %arg6 -> %arg2 : !fir.ref<i32>) {
+ // CHECK: fir.store %[[ARG_0]] to %[[ARG_1]]
+ fir.store %arg5 to %arg2 : !fir.ref<i32>
+ // CHECK-NOT: omp.flush
+ omp.flush
+ // CHECK-NOT: omp.taskyield
+ omp.taskyield
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @teams
+func.func @teams(%arg0: i32, %arg1: !fir.ref<i32>) {
+ // CHECK-NOT: omp.teams
+ omp.teams {
+ // CHECK: fir.store
+ fir.store %arg0 to %arg1 : !fir.ref<i32>
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @distribute
+func.func @distribute(%arg0: i32, %arg1: i32, %arg2: !fir.ref<i32>) {
+ %c1_i32 = arith.constant 1 : i32
+ // CHECK-NOT: omp.teams
+ omp.teams {
+ // CHECK-NOT: omp.distribute
+ omp.distribute {
+ // CHECK-NOT: omp.loop_nest
+ // CHECK: fir.do_loop
+ omp.loop_nest (%arg5) : i32 = (%arg0) to (%arg1) inclusive step (%c1_i32) {
+ // CHECK: fir.store
+ fir.store %arg0 to %arg2 : !fir.ref<i32>
+ // CHECK-NOT: omp.yield
+ omp.yield
+ }
+ }
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @threadprivate(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>
+func.func @threadprivate(%arg0: i32, %arg1: !fir.ref<i32>) {
+ // CHECK-NOT: omp.threadprivate
+ %1 = omp.threadprivate %arg1 : !fir.ref<i32> -> !fir.ref<i32>
+ // CHECK: fir.store %[[ARG_0]] to %[[ARG_1]]
+ fir.store %arg0 to %1 : !fir.ref<i32>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @taskloop(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>
+func.func @taskloop(%funcArg0: i32, %funcArg1: !fir.ref<i32>) {
+ %c1_i32 = arith.constant 1 : i32
+ %c2_i32 = arith.constant 2 : i32
+ %c10_i32 = arith.constant 10 : i32
+ // CHECK-NOT: omp.taskloop
+ omp.taskloop grainsize(%c2_i32: i32) {
+ // CHECK-NOT: omp.loop_nest
+ // CHECK: fir.do_loop
+ omp.loop_nest (%arg1) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) {
+ // CHECK: fir.store %[[ARG_0]] to %[[ARG_1]]
+ fir.store %funcArg0 to %funcArg1 : !fir.ref<i32>
+ // CHECK-NOT: omp.yield
+ omp.yield
+ }
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @target_update_enter_data_map_info(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>
+func.func @target_update_enter_data_map_info(%funcArg0: i32, %funcArg1: !fir.ref<i32>) {
+ %c1 = arith.constant 1 : index
+ // CHECK-NOT: omp.map.bounds
+ %1 = omp.map.bounds lower_bound(%c1 : index) upper_bound(%c1 : index) extent(%c1 : index) stride(%c1 : index) start_idx(%c1 : index)
+ // CHECK-NOT: omp.map.info
+ %13 = omp.map.info var_ptr(%funcArg1 : !fir.ref<i32>, i32) map_clauses(to) capture(ByRef) bounds(%1) -> !fir.ref<i32>
+ // CHECK-NOT: omp.target_enter_data
+ omp.target_enter_data map_entries(%13 : !fir.ref<i32>)
+ // CHECK-NOT: omp.target
+ omp.target map_entries(%13 -> %arg3 : !fir.ref<i32>) {
+ %c1_i32 = arith.constant 1 : i32
+ // CHECK: fir.store %c1_i32 to %[[ARG_1]]
+ fir.store %c1_i32 to %arg3 : !fir.ref<i32>
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ // CHECK-NOT: omp.map.info
+ %18 = omp.map.info var_ptr(%funcArg1 : !fir.ref<i32>, i32) map_clauses(from) capture(ByRef) bounds(%1) -> !fir.ref<i32>
+ // CHECK-NOT: omp.target_update
+ omp.target_update map_entries(%18 : !fir.ref<i32>)
+ // CHECK-NOT: omp.target_exit_data
+ omp.target_exit_data map_entries(%18 : !fir.ref<i32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @target_data(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>
+func.func @target_data(%funcArg0: i32, %funcArg1: !fir.ref<i32>) {
+ %c1 = arith.constant 1 : index
+ // CHECK-NOT: omp.map.bounds
+ %3 = omp.map.bounds lower_bound(%c1 : index) upper_bound(%c1 : index) extent(%c1 : index) stride(%c1 : index) start_idx(%c1 : index)
+ // CHECK-NOT: omp.map.info
+ %4 = omp.map.info var_ptr(%funcArg1 : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) bounds(%3) -> !fir.ref<i32>
+ // CHECK-NOT: omp.target_data
+ omp.target_data map_entries(%4 : !fir.ref<i32>) {
+ %c1_i32 = arith.constant 1 : i32
+ // CHECK: fir.store %c1_i32 to %[[ARG_1]]
+ fir.store %c1_i32 to %4 : !fir.ref<i32>
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @sections(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>, %[[ARG_2:.*]]: !fir.ref<i32>
+func.func @sections(%funcArg0: i32, %funcArg1: !fir.ref<i32>, %funcArg2: !fir.ref<i32>) {
+ // CHECK-NOT: omp.sections
+ omp.sections {
+ // CHECK-NOT: omp.section
+ omp.section {
+ // CHECK: fir.store
+ fir.store %funcArg0 to %funcArg1 : !fir.ref<i32>
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ // CHECK-NOT: omp.section
+ omp.section {
+ // CHECK: fir.store
+ fir.store %funcArg0 to %funcArg2 : !fir.ref<i32>
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+omp.declare_reduction @add_reduction_i32 : i32 init {
+^bb0(%arg0: i32):
+ %c0_i32 = arith.constant 0 : i32
+ omp.yield(%c0_i32 : i32)
+} combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %0 = arith.addi %arg0, %arg1 : i32
+ omp.yield(%0 : i32)
+}
+// CHECK-LABEL: func.func @reduction_scan(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>
+func.func @reduction_scan(%funcArg0: i32, %funcArg1: !fir.ref<i32>) {
+ %c1_i32 = arith.constant 1 : i32
+ %c8_i32 = arith.constant 8 : i32
+ // CHECK-NOT: omp.wsloop
+ omp.wsloop reduction(mod: inscan, @add_reduction_i32 %funcArg1 -> %arg3 : !fir.ref<i32>) {
+ // CHECK-NOT: omp.loop_nest
+ // CHECK: fir.do_loop
+ omp.loop_nest (%arg2) : i32 = (%c1_i32) to (%c8_i32) inclusive step (%c1_i32) {
+ // CHECK: fir.declare %[[ARG_1]]
+ %1 = fir.declare %arg3 {uniq_name = "a"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ // CHECK-NOT: omp.scan
+ omp.scan inclusive(%1 : !fir.ref<i32>)
+ // CHECK: fir.store
+ fir.store %funcArg0 to %1 : !fir.ref<i32>
+ // CHECK-NOT: omp.yield
+ omp.yield
+ }
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @ordered(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>
+func.func @ordered(%funcArg0: i32, %funcArg1: !fir.ref<i32>) {
+ %c1_i32 = arith.constant 1 : i32
+ %c10_i32 = arith.constant 10 : i32
+ // CHECK-NOT: omp.parallel
+ omp.parallel {
+ // CHECK-NOT: omp.wsloop
+ omp.wsloop ordered(0) {
+ // CHECK-NOT: omp.loop_nest
+ // CHECK: fir.do_loop
+ omp.loop_nest (%arg2) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) {
+ // CHECK-NOT: omp.ordered.region
+ omp.ordered.region {
+ // CHECK: fir.store
+ fir.store %funcArg0 to %funcArg1 : !fir.ref<i32>
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ // CHECK-NOT: omp.yield
+ omp.yield
+ }
+ }
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @master(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>, %[[ARG_2:.*]]: !fir.ref<i32>
+func.func @master(%funcArg0: i32, %funcArg1: !fir.ref<i32>, %funcArg2: !fir.ref<i32>) {
+ // CHECK-NOT: omp.parallel
+ omp.parallel {
+ // CHECK: fir.store
+ fir.store %funcArg0 to %funcArg1 : !fir.ref<i32>
+ // CHECK-NOT: omp.master
+ omp.master {
+ // CHECK: fir.store
+ fir.store %funcArg0 to %funcArg2 : !fir.ref<i32>
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @masked(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>, %[[ARG_2:.*]]: !fir.ref<i32>
+func.func @masked(%funcArg0: i32, %funcArg1: !fir.ref<i32>, %funcArg2: !fir.ref<i32>) {
+ // CHECK-NOT: omp.parallel
+ omp.parallel {
+ // CHECK: fir.store
+ fir.store %funcArg0 to %funcArg1 : !fir.ref<i32>
+ // CHECK-NOT: omp.masked
+ omp.masked {
+ // CHECK: fir.store
+ fir.store %funcArg0 to %funcArg2 : !fir.ref<i32>
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @critical(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>, %[[ARG_2:.*]]: !fir.ref<i32>
+omp.critical.declare @mylock
+func.func @critical(%funcArg0: i32, %funcArg1: !fir.ref<i32>, %funcArg2: !fir.ref<i32>) {
+ // CHECK-NOT: omp.parallel
+ omp.parallel {
+ // CHECK: fir.store
+ fir.store %funcArg0 to %funcArg1 : !fir.ref<i32>
+ // CHECK-NOT: omp.critical
+ omp.critical(@mylock) {
+ // CHECK: fir.store
+ fir.store %funcArg0 to %funcArg2 : !fir.ref<i32>
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @cancel(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>, %[[ARG_2:.*]]: !fir.ref<i32>, %[[ARG_3:.*]]: i1
+func.func @cancel(%funcArg0: i32, %funcArg1: !fir.ref<i32>, %funcArg2: !fir.ref<i32>, %funcArg3: i1) {
+ %c1_i32 = arith.constant 1 : i32
+ %c10_i32 = arith.constant 10 : i32
+ // CHECK-NOT: omp.parallel
+ omp.parallel {
+ // CHECK-NOT: omp.wsloop
+ omp.wsloop {
+ // CHECK-NOT: omp.loop_nest
+ // CHECK: fir.do_loop
+ omp.loop_nest (%arg1) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) {
+ // CHECK: fir.store
+ fir.store %funcArg0 to %funcArg1 : !fir.ref<i32>
+ // CHECK-NOT: fir.if
+ fir.if %funcArg3 {
+ // CHECK-NOT: omp.cancel
+ omp.cancel cancellation_construct_type(loop)
+ }
+ // CHECK-NOT: omp.cancellation_point
+ omp.cancellation_point cancellation_construct_type(loop)
+ // CHECK: fir.store
+ fir.store %funcArg0 to %funcArg2 : !fir.ref<i32>
+ // CHECK-NOT: omp.yield
+ omp.yield
+ }
+ }
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @atomic(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>, %[[ARG_2:.*]]: !fir.ref<i32>, %[[ARG_3:.*]]: i32
+func.func @atomic(%funcArg0: i32, %funcArg1: !fir.ref<i32>, %funcArg2: !fir.ref<i32>, %funcArg3: i32) {
+ %c1_i32 = arith.constant 1 : i32
+ %5 = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
+ // CHECK: %[[VAL_0:.*]] = fir.declare
+ %6 = fir.declare %5 {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ // CHECK-NOT: omp.parallel
+ omp.parallel {
+ // CHECK-NOT: omp.atomic.write
+ // CHECK: fir.store %[[ARG_0]] to %[[ARG_1]]
+ omp.atomic.write %funcArg1 = %funcArg0 : !fir.ref<i32>, i32
+ // CHECK-NOT: omp.atomic.read
+ // CHECK: %[[VAL_1:.*]] = fir.load %[[ARG_1]]
+ // CHECK-NEXT: fir.store %[[VAL_1]] to %[[ARG_2]]
+ omp.atomic.read %funcArg2 = %funcArg1 : !fir.ref<i32>, !fir.ref<i32>, i32
+ // CHECK-NOT: omp.atomic.update
+ // CHECK: fir.load %[[VAL_0]]
+ // CHECK-NEXT: %[[ADD_VAL:.*]] = arith.addi
+ // CHECK-NOT: omp.yield
+ // CHECK-NEXT: fir.store %[[ADD_VAL]] to %[[VAL_0]]
+ omp.atomic.update %6 : !fir.ref<i32> {
+ ^bb0(%arg3: i32):
+ %88 = arith.addi %arg3, %c1_i32 : i32
+ omp.yield(%88 : i32)
+ }
+ // CHECK-NOT: omp.atomic.read
+ // CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]]
+ // CHECK-NEXT: fir.store %[[VAL_2]] to %[[ARG_1]]
+ omp.atomic.read %funcArg1 = %6 : !fir.ref<i32>, !fir.ref<i32>, i32
+ // CHECK-NOT: omp.atomic.capture
+ omp.atomic.capture {
+ // CHECK-NOT: omp.atomic.read
+ // CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_0]]
+ // CHECK-NEXT: fir.store %[[VAL_3]] to %[[ARG_2]]
+ omp.atomic.read %funcArg2 = %6 : !fir.ref<i32>, !fir.ref<i32>, i32
+ // CHECK-NOT: omp.atomic.update
+ // CHECK: fir.load %[[VAL_0]]
+ // CHECK-NEXT: %[[ADD_VAL_2:.*]] = arith.addi
+ // CHECK-NOT: omp.yield
+ // CHECK-NEXT: fir.store %[[ADD_VAL_2]] to %[[VAL_0]]
+ omp.atomic.update %6 : !fir.ref<i32> {
+ ^bb0(%arg3: i32):
+ %88 = arith.addi %arg3, %c1_i32 : i32
+ omp.yield(%88 : i32)
+ }
+ }
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @multi_block(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>, %[[ARG_3:.*]]: i1
+func.func @multi_block(%funcArg0: i32, %funcArg1: !fir.ref<i32>, %6: i1) {
+ %false = arith.constant false
+ %c0_i32 = arith.constant 0 : i32
+ // CHECK-NOT: omp.parallel
+ omp.parallel {
+ // CHECK: cf.cond_br %[[ARG_3]], ^[[BB1:.*]], ^[[BB2:.*]]
+ cf.cond_br %6, ^bb1, ^bb2
+ // CHECK: ^[[BB1]]
+ ^bb1: // pred: ^bb0
+ // CHECK: fir.call
+ fir.call @_FortranAStopStatement(%c0_i32, %false, %false) fastmath<contract> : (i32, i1, i1) -> ()
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ // CHECK: ^[[BB2]]
+ ^bb2: // pred: ^bb0
+ // CHECK: fir.store
+ fir.store %funcArg0 to %funcArg1 : !fir.ref<i32>
+ // CHECK-NOT: omp.terminator
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @do_multi_block(
+// CHECK-SAME: %[[ARG_0:.*]]: i32, %[[ARG_1:.*]]: !fir.ref<i32>, %[[ARG_3:.*]]: i1
+func.func @do_multi_block(%funcArg0: i32, %funcArg1: !fir.ref<i32>, %6: i1) {
+ %false = arith.constant false
+ %c1_i32 = arith.constant 1 : i32
+ %c100_i32 = arith.constant 100 : i32
+ // CHECK-NOT: omp.wsloop
+ omp.wsloop {
+ // CHECK-NOT: omp.loop_nest
+ // CHECK: cf.br ^[[CBB:.*]](
+ // CHECK: ^[[CBB]]
+ // CHECK: %[[CMP_VAL:.*]] = arith.cmpi
+ // CHECK: cf.cond_br %[[CMP_VAL]], ^[[FBB:.*]], ^[[LBB:.*]]
+ omp.loop_nest (%arg2) : i32 = (%c1_i32) to (%c100_i32) inclusive step (%c1_i32) {
+ // CHECK: ^[[FBB]]
+ // CHECK: fir.store
+ fir.store %funcArg0 to %funcArg1 : !fir.ref<i32>
+ // CHECK: cf.br ^[[BBB:.*]]
+ cf.br ^bb1
+ // CHECK: ^[[BBB]]
+ ^bb1: // pred: ^bb0
+ // CHECK: fir.store
+ fir.store %c1_i32 to %funcArg1 : !fir.ref<i32>
+ // CHECK: cf.cond_br
+ cf.cond_br %6, ^bb2, ^bb3
+ // CHECK: ^[[SBB:.*]]
+ ^bb2: // pred: ^bb1
+ // CHECK: fir.call
+ fir.call @_FortranAStopStatement(%c1_i32, %false, %false) fastmath<contract> : (i32, i1, i1) -> ()
+ // CHECK-NOT: omp.yield
+ omp.yield
+ // CHECK: cf.br ^[[LBB:.*]]
+ // CHECK: ^[[OBB:.*]]
+ // CHECK: cf.br ^[[LBB]]
+ // CHECK: ^[[LBB]]
+ // CHECK: arith.subi
+ // CHECK: cf.br ^[[CBB]]
+ // CHECK: ^[[EBB:.*]]
+ ^bb3: // pred: ^bb1
+ // CHECK-NOT: omp.yield
+ omp.yield
+ }
+ }
+ return
+}
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index edfc878d17524..82dff2653ad09 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -520,7 +520,9 @@ static llvm::LogicalResult convertFortranSourceToMLIR(
if (emitFIR && useHLFIR) {
// lower HLFIR to FIR
- fir::createHLFIRToFIRPassPipeline(pm, enableOpenMP,
+ fir::EnableOpenMP enableOmp =
+ enableOpenMP ? fir::EnableOpenMP::Full : fir::EnableOpenMP::None;
+ fir::createHLFIRToFIRPassPipeline(pm, enableOmp,
llvm::OptimizationLevel::O2);
if (mlir::failed(pm.run(mlirModule))) {
llvm::errs() << "FATAL: lowering from HLFIR to FIR failed";
More information about the flang-commits
mailing list