[Mlir-commits] [mlir] 8647e4c - [mlir] support translating OpenMP loops with reductions

Thu Sep 2 06:38:29 PDT 2021

Author: Alex Zinenko
Date: 2021-09-02T15:38:20+02:00
New Revision: 8647e4c3a0a3d87af48863f381420041bc813692

URL: https://github.com/llvm/llvm-project/commit/8647e4c3a0a3d87af48863f381420041bc813692
DIFF: https://github.com/llvm/llvm-project/commit/8647e4c3a0a3d87af48863f381420041bc813692.diff

LOG: [mlir] support translating OpenMP loops with reductions

Use the recently introduced OpenMPIRBuilder facility to transate OpenMP
workshare loops with reductions to LLVM IR calling OpenMP runtime. Most of the
heavy lifting is done at the OpenMPIRBuilder. When other OpenMP dialect
constructs grow support for reductions, the translation can be updated to
operate on, e.g., an operation interface for all reduction containers instead
of workshare loops specifically. Designing such a generic translation for the
single operation that currently supports reductions is premature since we don't
know how the reduction modeling itself will be generalized.

Reviewed By: kiranchandramohan

Differential Revision: https://reviews.llvm.org/D107343

Added: 
    mlir/test/Target/LLVMIR/openmp-reduction.mlir

Modified: 
    mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
    mlir/lib/Target/LLVMIR/CMakeLists.txt
    mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
    mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
    mlir/test/Target/LLVMIR/llvmir-invalid.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
index 6df772d8e68b1..c95488d14a49b 100644

--- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
@@ -83,6 +83,9 @@ class ModuleTranslation {
     return valueMapping.lookup(value);
   }
 
+  /// Looks up remapped a list of remapped values.
+  SmallVector<llvm::Value *> lookupValues(ValueRange values);
+
   /// Stores the mapping between an MLIR block and LLVM IR basic block.
   void mapBlock(Block *mlir, llvm::BasicBlock *llvm) {
     auto result = blockMapping.try_emplace(mlir, llvm);
@@ -110,6 +113,10 @@ class ModuleTranslation {
     return branchMapping.lookup(op);
   }
 
+  /// Removes the mapping for blocks contained in the region and values defined
+  /// in these blocks.
+  void forgetMapping(Region &region);
+
   /// Returns the LLVM metadata corresponding to a reference to an mlir LLVM
   /// dialect access group operation.
   llvm::MDNode *getAccessGroup(Operation &opInst,
@@ -142,9 +149,6 @@ class ModuleTranslation {
   /// Converts the type from MLIR LLVM dialect to LLVM.
   llvm::Type *convertType(Type type);
 
-  /// Looks up remapped a list of remapped values.
-  SmallVector<llvm::Value *, 8> lookupValues(ValueRange values);
-
   /// Returns the MLIR context of the module being translated.
   MLIRContext &getContext() { return *mlirModule->getContext(); }
 
@@ -217,7 +221,7 @@ class ModuleTranslation {
   /// translated makes the frame available when translating ops within that
   /// region.
   template <typename T, typename... Args>
-  void stackPush(Args &&... args) {
+  void stackPush(Args &&...args) {
     static_assert(
         std::is_base_of<StackFrame, T>::value,
         "can only push instances of StackFrame on ModuleTranslation stack");

diff  --git a/mlir/lib/Target/LLVMIR/CMakeLists.txt b/mlir/lib/Target/LLVMIR/CMakeLists.txt
index 4c5d857d92d38..0214616d6459e 100644
--- a/mlir/lib/Target/LLVMIR/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/CMakeLists.txt
@@ -28,8 +28,6 @@ add_mlir_translation_library(MLIRTargetLLVMIRExport
 
   LINK_LIBS PUBLIC
   MLIRLLVMIR
-  MLIROpenACC
-  MLIROpenMP
   MLIRLLVMIRTransforms
   MLIRTranslation
   )

diff  --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 29dbe7a6d33b4..0fd81e85a692b 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 #include "mlir/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
@@ -33,6 +34,19 @@ class OpenMPAllocaStackFrame
       : allocaInsertPoint(allocaIP) {}
   llvm::OpenMPIRBuilder::InsertPointTy allocaInsertPoint;
 };
+
+/// ModuleTranslation stack frame containing the partial mapping between MLIR
+/// values and their LLVM IR equivalents.
+class OpenMPVarMappingStackFrame
+    : public LLVM::ModuleTranslation::StackFrameBase<
+          OpenMPVarMappingStackFrame> {
+public:
+  explicit OpenMPVarMappingStackFrame(
+      const DenseMap<Value, llvm::Value *> &mapping)
+      : mapping(mapping) {}
+
+  DenseMap<Value, llvm::Value *> mapping;
+};
 } // namespace
 
 /// Find the insertion point for allocas given the current insertion point for
@@ -62,22 +76,66 @@ findAllocaInsertPoint(llvm::IRBuilderBase &builder,
 /// Converts the given region that appears within an OpenMP dialect operation to
 /// LLVM IR, creating a branch from the `sourceBlock` to the entry block of the
 /// region, and a branch from any block with an successor-less OpenMP terminator
-/// to `continuationBlock`.
-static void convertOmpOpRegions(Region &region, StringRef blockName,
-                                llvm::BasicBlock &sourceBlock,
-                                llvm::BasicBlock &continuationBlock,
-                                llvm::IRBuilderBase &builder,
-                                LLVM::ModuleTranslation &moduleTranslation,
-                                LogicalResult &bodyGenStatus) {
+/// to `continuationBlock`. Populates `continuationBlockPHIs` with the PHI nodes
+/// of the continuation block if provided.
+static void convertOmpOpRegions(
+    Region &region, StringRef blockName, llvm::BasicBlock &sourceBlock,
+    llvm::BasicBlock &continuationBlock, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation, LogicalResult &bodyGenStatus,
+    SmallVectorImpl<llvm::PHINode *> *continuationBlockPHIs = nullptr) {
   llvm::LLVMContext &llvmContext = builder.getContext();
   for (Block &bb : region) {
     llvm::BasicBlock *llvmBB = llvm::BasicBlock::Create(
-        llvmContext, blockName, builder.GetInsertBlock()->getParent());
+        llvmContext, blockName, builder.GetInsertBlock()->getParent(),
+        builder.GetInsertBlock()->getNextNode());
     moduleTranslation.mapBlock(&bb, llvmBB);
   }
 
   llvm::Instruction *sourceTerminator = sourceBlock.getTerminator();
 
+  // Terminators (namely YieldOp) may be forwarding values to the region that
+  // need to be available in the continuation block. Collect the types of these
+  // operands in preparation of creating PHI nodes.
+  SmallVector<llvm::Type *> continuationBlockPHITypes;
+  bool operandsProcessed = false;
+  unsigned numYields = 0;
+  for (Block &bb : region.getBlocks()) {
+    if (omp::YieldOp yield = dyn_cast<omp::YieldOp>(bb.getTerminator())) {
+      if (!operandsProcessed) {
+        for (unsigned i = 0, e = yield->getNumOperands(); i < e; ++i) {
+          continuationBlockPHITypes.push_back(
+              moduleTranslation.convertType(yield->getOperand(i).getType()));
+        }
+        operandsProcessed = true;
+      } else {
+        assert(continuationBlockPHITypes.size() == yield->getNumOperands() &&
+               "mismatching number of values yielded from the region");
+        for (unsigned i = 0, e = yield->getNumOperands(); i < e; ++i) {
+          llvm::Type *operandType =
+              moduleTranslation.convertType(yield->getOperand(i).getType());
+          (void)operandType;
+          assert(continuationBlockPHITypes[i] == operandType &&
+                 "values of mismatching types yielded from the region");
+        }
+      }
+      numYields++;
+    }
+  }
+
+  // Insert PHI nodes in the continuation block for any values forwarded by the
+  // terminators in this region.
+  if (!continuationBlockPHITypes.empty())
+    assert(
+        continuationBlockPHIs &&
+        "expected continuation block PHIs if converted regions yield values");
+  if (continuationBlockPHIs) {
+    llvm::IRBuilderBase::InsertPointGuard guard(builder);
+    continuationBlockPHIs->reserve(continuationBlockPHITypes.size());
+    builder.SetInsertPoint(&continuationBlock, continuationBlock.begin());
+    for (llvm::Type *ty : continuationBlockPHITypes)
+      continuationBlockPHIs->push_back(builder.CreatePHI(ty, numYields));
+  }
+
   // Convert blocks one by one in topological order to ensure
   // defs are converted before uses.
   SetVector<Block *> blocks =
@@ -108,12 +166,24 @@ static void convertOmpOpRegions(Region &region, StringRef blockName,
     // ModuleTranslation class to set up the correct insertion point. This is
     // also consistent with MLIR's idiom of handling special region terminators
     // in the same code that handles the region-owning operation.
-    if (isa<omp::TerminatorOp, omp::YieldOp>(bb->getTerminator()))
+    Operation *terminator = bb->getTerminator();
+    if (isa<omp::TerminatorOp, omp::YieldOp>(terminator)) {
       builder.CreateBr(&continuationBlock);
+
+      for (unsigned i = 0, e = terminator->getNumOperands(); i < e; ++i)
+        (*continuationBlockPHIs)[i]->addIncoming(
+            moduleTranslation.lookupValue(terminator->getOperand(i)), llvmBB);
+    }
   }
-  // Finally, after all blocks have been traversed and values mapped,
-  // connect the PHI nodes to the results of preceding blocks.
+  // After all blocks have been traversed and values mapped, connect the PHI
+  // nodes to the results of preceding blocks.
   LLVM::detail::connectPHINodes(region, moduleTranslation);
+
+  // Remove the blocks and values defined in this region from the mapping since
+  // they are not visible outside of this region. This allows the same region to
+  // be converted several times, that is cloned, without clashes, and slightly
+  // speeds up the lookups.
+  moduleTranslation.forgetMapping(region);
 }
 
 /// Converts the OpenMP parallel operation to LLVM IR.
@@ -243,6 +313,167 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder,
   return success();
 }
 
+/// Returns a reduction declaration that corresponds to the given reduction
+/// operation in the given container. Currently only supports reductions inside
+/// WsLoopOp but can be easily extended.
+static omp::ReductionDeclareOp findReductionDecl(omp::WsLoopOp container,
+                                                 omp::ReductionOp reduction) {
+  SymbolRefAttr reductionSymbol;
+  for (unsigned i = 0, e = container.getNumReductionVars(); i < e; ++i) {
+    if (container.reduction_vars()[i] != reduction.accumulator())
+      continue;
+    reductionSymbol = (*container.reductions())[i].cast<SymbolRefAttr>();
+    break;
+  }
+  assert(reductionSymbol &&
+         "reduction operation must be associated with a declaration");
+
+  return SymbolTable::lookupNearestSymbolFrom<omp::ReductionDeclareOp>(
+      container, reductionSymbol);
+}
+
+/// Populates `reductions` with reduction declarations used in the given loop.
+static void
+collectReductionDecls(omp::WsLoopOp loop,
+                      SmallVectorImpl<omp::ReductionDeclareOp> &reductions) {
+  Optional<ArrayAttr> attr = loop.reductions();
+  if (!attr)
+    return;
+
+  reductions.reserve(reductions.size() + loop.getNumReductionVars());
+  for (auto symbolRef : attr->getAsRange<SymbolRefAttr>()) {
+    reductions.push_back(
+        SymbolTable::lookupNearestSymbolFrom<omp::ReductionDeclareOp>(
+            loop, symbolRef));
+  }
+}
+
+/// Translates the blocks contained in the given region and appends them to at
+/// the current insertion point of `builder`. The operations of the entry block
+/// are appended to the current insertion block, which is not expected to have a
+/// terminator. If set, `continuationBlockArgs` is populated with translated
+/// values that correspond to the values omp.yield'ed from the region.
+static LogicalResult inlineConvertOmpRegions(
+    Region &region, StringRef blockName, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation,
+    SmallVectorImpl<llvm::Value *> *continuationBlockArgs = nullptr) {
+  if (region.empty())
+    return success();
+
+  // Special case for single-block regions that don't create additional blocks:
+  // insert operations without creating additional blocks.
+  if (llvm::hasSingleElement(region)) {
+    moduleTranslation.mapBlock(&region.front(), builder.GetInsertBlock());
+    if (failed(moduleTranslation.convertBlock(
+            region.front(), /*ignoreArguments=*/true, builder)))
+      return failure();
+
+    // The continuation arguments are simply the translated terminator operands.
+    if (continuationBlockArgs)
+      llvm::append_range(
+          *continuationBlockArgs,
+          moduleTranslation.lookupValues(region.front().back().getOperands()));
+
+    // Drop the mapping that is no longer necessary so that the same region can
+    // be processed multiple times.
+    moduleTranslation.forgetMapping(region);
+    return success();
+  }
+
+  // Create the continuation block manually instead of calling splitBlock
+  // because the current insertion block may not have a terminator.
+  llvm::BasicBlock *continuationBlock =
+      llvm::BasicBlock::Create(builder.getContext(), blockName + ".cont",
+                               builder.GetInsertBlock()->getParent(),
+                               builder.GetInsertBlock()->getNextNode());
+  builder.CreateBr(continuationBlock);
+
+  LogicalResult bodyGenStatus = success();
+  SmallVector<llvm::PHINode *> phis;
+  convertOmpOpRegions(region, blockName, *builder.GetInsertBlock(),
+                      *continuationBlock, builder, moduleTranslation,
+                      bodyGenStatus, &phis);
+  if (failed(bodyGenStatus))
+    return failure();
+  if (continuationBlockArgs)
+    llvm::append_range(*continuationBlockArgs, phis);
+  builder.SetInsertPoint(continuationBlock,
+                         continuationBlock->getFirstInsertionPt());
+  return success();
+}
+
+namespace {
+/// Owning equivalents of OpenMPIRBuilder::(Atomic)ReductionGen that are used to
+/// store lambdas with capture.
+using OwningReductionGen = std::function<llvm::OpenMPIRBuilder::InsertPointTy(
+    llvm::OpenMPIRBuilder::InsertPointTy, llvm::Value *, llvm::Value *,
+    llvm::Value *&)>;
+using OwningAtomicReductionGen =
+    std::function<llvm::OpenMPIRBuilder::InsertPointTy(
+        llvm::OpenMPIRBuilder::InsertPointTy, llvm::Value *, llvm::Value *)>;
+} // namespace
+
+/// Create an OpenMPIRBuilder-compatible reduction generator for the given
+/// reduction declaration. The generator uses `builder` but ignores its
+/// insertion point.
+static OwningReductionGen
+makeReductionGen(omp::ReductionDeclareOp decl, llvm::IRBuilderBase &builder,
+                 LLVM::ModuleTranslation &moduleTranslation) {
+  // The lambda is mutable because we need access to non-const methods of decl
+  // (which aren't actually mutating it), and we must capture decl by-value to
+  // avoid the dangling reference after the parent function returns.
+  OwningReductionGen gen =
+      [&, decl](llvm::OpenMPIRBuilder::InsertPointTy insertPoint,
+                llvm::Value *lhs, llvm::Value *rhs,
+                llvm::Value *&result) mutable {
+        Region &reductionRegion = decl.reductionRegion();
+        moduleTranslation.mapValue(reductionRegion.front().getArgument(0), lhs);
+        moduleTranslation.mapValue(reductionRegion.front().getArgument(1), rhs);
+        builder.restoreIP(insertPoint);
+        SmallVector<llvm::Value *> phis;
+        if (failed(inlineConvertOmpRegions(reductionRegion,
+                                           "omp.reduction.nonatomic.body",
+                                           builder, moduleTranslation, &phis)))
+          return llvm::OpenMPIRBuilder::InsertPointTy();
+        assert(phis.size() == 1);
+        result = phis[0];
+        return builder.saveIP();
+      };
+  return gen;
+}
+
+/// Create an OpenMPIRBuilder-compatible atomic reduction generator for the
+/// given reduction declaration. The generator uses `builder` but ignores its
+/// insertion point. Returns null if there is no atomic region available in the
+/// reduction declaration.
+static OwningAtomicReductionGen
+makeAtomicReductionGen(omp::ReductionDeclareOp decl,
+                       llvm::IRBuilderBase &builder,
+                       LLVM::ModuleTranslation &moduleTranslation) {
+  if (decl.atomicReductionRegion().empty())
+    return OwningAtomicReductionGen();
+
+  // The lambda is mutable because we need access to non-const methods of decl
+  // (which aren't actually mutating it), and we must capture decl by-value to
+  // avoid the dangling reference after the parent function returns.
+  OwningAtomicReductionGen atomicGen =
+      [&, decl](llvm::OpenMPIRBuilder::InsertPointTy insertPoint,
+                llvm::Value *lhs, llvm::Value *rhs) mutable {
+        Region &atomicRegion = decl.atomicReductionRegion();
+        moduleTranslation.mapValue(atomicRegion.front().getArgument(0), lhs);
+        moduleTranslation.mapValue(atomicRegion.front().getArgument(1), rhs);
+        builder.restoreIP(insertPoint);
+        SmallVector<llvm::Value *> phis;
+        if (failed(inlineConvertOmpRegions(atomicRegion,
+                                           "omp.reduction.atomic.body", builder,
+                                           moduleTranslation, &phis)))
+          return llvm::OpenMPIRBuilder::InsertPointTy();
+        assert(phis.empty());
+        return builder.saveIP();
+      };
+  return atomicGen;
+}
+
 /// Converts an OpenMP workshare loop into LLVM IR using OpenMPIRBuilder.
 static LogicalResult
 convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
@@ -258,6 +489,57 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
     schedule =
         *omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue());
 
+  // Find the loop configuration.
+  llvm::Value *step = moduleTranslation.lookupValue(loop.step()[0]);
+  llvm::Type *ivType = step->getType();
+  llvm::Value *chunk =
+      loop.schedule_chunk_var()
+          ? moduleTranslation.lookupValue(loop.schedule_chunk_var())
+          : llvm::ConstantInt::get(ivType, 1);
+
+  SmallVector<omp::ReductionDeclareOp> reductionDecls;
+  collectReductionDecls(loop, reductionDecls);
+  llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
+      findAllocaInsertPoint(builder, moduleTranslation);
+
+  // Allocate space for privatized reduction variables.
+  SmallVector<llvm::Value *> privateReductionVariables;
+  DenseMap<Value, llvm::Value *> reductionVariableMap;
+  unsigned numReductions = loop.getNumReductionVars();
+  privateReductionVariables.reserve(numReductions);
+  if (numReductions != 0) {
+    llvm::IRBuilderBase::InsertPointGuard guard(builder);
+    builder.restoreIP(allocaIP);
+    for (unsigned i = 0; i < numReductions; ++i) {
+      auto reductionType =
+          loop.reduction_vars()[i].getType().cast<LLVM::LLVMPointerType>();
+      llvm::Value *var = builder.CreateAlloca(
+          moduleTranslation.convertType(reductionType.getElementType()));
+      privateReductionVariables.push_back(var);
+      reductionVariableMap.try_emplace(loop.reduction_vars()[i], var);
+    }
+  }
+
+  // Store the mapping between reduction variables and their private copies on
+  // ModuleTranslation stack. It can be then recovered when translating
+  // omp.reduce operations in a separate call.
+  LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
+      moduleTranslation, reductionVariableMap);
+
+  // Before the loop, store the initial values of reductions into reduction
+  // variables. Although this could be done after allocas, we don't want to mess
+  // up with the alloca insertion point.
+  for (unsigned i = 0; i < numReductions; ++i) {
+    SmallVector<llvm::Value *> phis;
+    if (failed(inlineConvertOmpRegions(reductionDecls[i].initializerRegion(),
+                                       "omp.reduction.neutral", builder,
+                                       moduleTranslation, &phis)))
+      return failure();
+    assert(phis.size() == 1 && "expected one value to be yielded from the "
+                               "reduction neutral element declaration region");
+    builder.CreateStore(phis[0], privateReductionVariables[i]);
+  }
+
   // Set up the source location value for OpenMP runtime.
   llvm::DISubprogram *subprogram =
       builder.GetInsertBlock()->getParent()->getSubprogram();
@@ -329,14 +611,7 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::CanonicalLoopInfo *loopInfo =
       ompBuilder->collapseLoops(diLoc, loopInfos, {});
 
-  // Find the loop configuration.
-  llvm::Type *ivType = loopInfo->getIndVar()->getType();
-  llvm::Value *chunk =
-      loop.schedule_chunk_var()
-          ? moduleTranslation.lookupValue(loop.schedule_chunk_var())
-          : llvm::ConstantInt::get(ivType, 1);
-  llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
-      findAllocaInsertPoint(builder, moduleTranslation);
+  allocaIP = findAllocaInsertPoint(builder, moduleTranslation);
   if (schedule == omp::ClauseScheduleKind::Static) {
     ompBuilder->applyStaticWorkshareLoop(ompLoc.DL, loopInfo, allocaIP,
                                          !loop.nowait(), chunk);
@@ -369,6 +644,98 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
   // potential further loop transformations. Use the insertion point stored
   // before collapsing loops instead.
   builder.restoreIP(afterIP);
+
+  // Process the reductions if required.
+  if (numReductions == 0)
+    return success();
+
+  // Create the reduction generators. We need to own them here because
+  // ReductionInfo only accepts references to the generators.
+  SmallVector<OwningReductionGen> owningReductionGens;
+  SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
+  for (unsigned i = 0; i < numReductions; ++i) {
+    owningReductionGens.push_back(
+        makeReductionGen(reductionDecls[i], builder, moduleTranslation));
+    owningAtomicReductionGens.push_back(
+        makeAtomicReductionGen(reductionDecls[i], builder, moduleTranslation));
+  }
+
+  // Collect the reduction information.
+  SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
+  reductionInfos.reserve(numReductions);
+  for (unsigned i = 0; i < numReductions; ++i) {
+    llvm::OpenMPIRBuilder::AtomicReductionGenTy atomicGen = nullptr;
+    if (owningAtomicReductionGens[i])
+      atomicGen = owningAtomicReductionGens[i];
+    reductionInfos.push_back(
+        {moduleTranslation.lookupValue(loop.reduction_vars()[i]),
+         privateReductionVariables[i], owningReductionGens[i], atomicGen});
+  }
+
+  // The call to createReductions below expects the block to have a
+  // terminator. Create an unreachable instruction to serve as terminator
+  // and remove it later.
+  llvm::UnreachableInst *tempTerminator = builder.CreateUnreachable();
+  builder.SetInsertPoint(tempTerminator);
+  llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =
+      ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos,
+                                   loop.nowait());
+  if (!contInsertPoint.getBlock())
+    return loop->emitOpError() << "failed to convert reductions";
+  auto nextInsertionPoint =
+      ompBuilder->createBarrier(contInsertPoint, llvm::omp::OMPD_for);
+  tempTerminator->eraseFromParent();
+  builder.restoreIP(nextInsertionPoint);
+
+  return success();
+}
+
+/// Converts an OpenMP reduction operation using OpenMPIRBuilder. Expects the
+/// mapping between reduction variables and their private equivalents to have
+/// been stored on the ModuleTranslation stack. Currently only supports
+/// reduction within WsLoopOp, but can be easily extended.
+static LogicalResult
+convertOmpReductionOp(omp::ReductionOp reductionOp,
+                      llvm::IRBuilderBase &builder,
+                      LLVM::ModuleTranslation &moduleTranslation) {
+  // Find the declaration that corresponds to the reduction op.
+  auto reductionContainer = reductionOp->getParentOfType<omp::WsLoopOp>();
+  omp::ReductionDeclareOp declaration =
+      findReductionDecl(reductionContainer, reductionOp);
+  assert(declaration && "could not find reduction declaration");
+
+  // Retrieve the mapping between reduction variables and their private
+  // equivalents.
+  const DenseMap<Value, llvm::Value *> *reductionVariableMap = nullptr;
+  moduleTranslation.stackWalk<OpenMPVarMappingStackFrame>(
+      [&](const OpenMPVarMappingStackFrame &frame) {
+        reductionVariableMap = &frame.mapping;
+        return WalkResult::interrupt();
+      });
+  assert(reductionVariableMap && "couldn't find private reduction variables");
+
+  // Translate the reduction operation by emitting the body of the corresponding
+  // reduction declaration.
+  Region &reductionRegion = declaration.reductionRegion();
+  llvm::Value *privateReductionVar =
+      reductionVariableMap->lookup(reductionOp.accumulator());
+  llvm::Value *reductionVal = builder.CreateLoad(
+      moduleTranslation.convertType(reductionOp.operand().getType()),
+      privateReductionVar);
+
+  moduleTranslation.mapValue(reductionRegion.front().getArgument(0),
+                             reductionVal);
+  moduleTranslation.mapValue(
+      reductionRegion.front().getArgument(1),
+      moduleTranslation.lookupValue(reductionOp.operand()));
+
+  SmallVector<llvm::Value *> phis;
+  if (failed(inlineConvertOmpRegions(reductionRegion, "omp.reduction.body",
+                                     builder, moduleTranslation, &phis)))
+    return failure();
+  assert(phis.size() == 1 && "expected one value to be yielded from "
+                             "the reduction body declaration region");
+  builder.CreateStore(phis[0], privateReductionVar);
   return success();
 }
 
@@ -426,6 +793,9 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation(
       .Case([&](omp::ParallelOp) {
         return convertOmpParallel(*op, builder, moduleTranslation);
       })
+      .Case([&](omp::ReductionOp reductionOp) {
+        return convertOmpReductionOp(reductionOp, builder, moduleTranslation);
+      })
       .Case([&](omp::MasterOp) {
         return convertOmpMaster(*op, builder, moduleTranslation);
       })
@@ -435,13 +805,14 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation(
       .Case([&](omp::WsLoopOp) {
         return convertOmpWsLoop(*op, builder, moduleTranslation);
       })
-      .Case<omp::YieldOp, omp::TerminatorOp>([](auto op) {
-        // `yield` and `terminator` can be just omitted. The block structure was
-        // created in the function that handles their parent operation.
-        assert(op->getNumOperands() == 0 &&
-               "unexpected OpenMP terminator with operands");
-        return success();
-      })
+      .Case<omp::YieldOp, omp::TerminatorOp, omp::ReductionDeclareOp>(
+          [](auto op) {
+            // `yield` and `terminator` can be just omitted. The block structure
+            // was created in the region that handles their parent operation.
+            // `reduction.declare` will be used by reductions and is not
+            // converted directly, skip it.
+            return success();
+          })
       .Default([&](Operation *inst) {
         return inst->emitError("unsupported OpenMP operation: ")
                << inst->getName();

diff  --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index 51133f63c722f..64c7970cc71d9 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -224,6 +224,31 @@ ModuleTranslation::~ModuleTranslation() {
     ompBuilder->finalize();
 }
 
+void ModuleTranslation::forgetMapping(Region &region) {
+  SmallVector<Region *> toProcess;
+  toProcess.push_back(&region);
+  while (!toProcess.empty()) {
+    Region *current = toProcess.pop_back_val();
+    for (Block &block : *current) {
+      blockMapping.erase(&block);
+      for (Value arg : block.getArguments())
+        valueMapping.erase(arg);
+      for (Operation &op : block) {
+        for (Value value : op.getResults())
+          valueMapping.erase(value);
+        if (op.hasSuccessors())
+          branchMapping.erase(&op);
+        if (isa<LLVM::GlobalOp>(op))
+          globalsMapping.erase(&op);
+        accessGroupMetadataMapping.erase(&op);
+        llvm::append_range(
+            toProcess,
+            llvm::map_range(op.getRegions(), [](Region &r) { return &r; }));
+      }
+    }
+  }
+}
+
 /// Get the SSA value passed to the current block from the terminator operation
 /// of its predecessor.
 static Value getPHISourceValue(Block *current, Block *pred,
@@ -686,15 +711,6 @@ LogicalResult ModuleTranslation::convertDialectAttributes(Operation *op) {
   return success();
 }
 
-/// Check whether the module contains only supported ops directly in its body.
-static LogicalResult checkSupportedModuleOps(Operation *m) {
-  for (Operation &o : getModuleBody(m).getOperations())
-    if (!isa<LLVM::LLVMFuncOp, LLVM::GlobalOp, LLVM::MetadataOp>(&o) &&
-        !o.hasTrait<OpTrait::IsTerminator>())
-      return o.emitOpError("unsupported module-level operation");
-  return success();
-}
-
 LogicalResult ModuleTranslation::convertFunctionSignatures() {
   // Declare all functions first because there may be function calls that form a
   // call graph with cycles, or global initializers that reference functions.
@@ -850,10 +866,9 @@ llvm::Type *ModuleTranslation::convertType(Type type) {
   return typeTranslator.translateType(type);
 }
 
-/// A helper to look up remapped operands in the value remapping table.`
-SmallVector<llvm::Value *, 8>
-ModuleTranslation::lookupValues(ValueRange values) {
-  SmallVector<llvm::Value *, 8> remapped;
+/// A helper to look up remapped operands in the value remapping table.
+SmallVector<llvm::Value *> ModuleTranslation::lookupValues(ValueRange values) {
+  SmallVector<llvm::Value *> remapped;
   remapped.reserve(values.size());
   for (Value v : values)
     remapped.push_back(lookupValue(v));
@@ -900,8 +915,6 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext,
                               StringRef name) {
   if (!satisfiesLLVMModule(module))
     return nullptr;
-  if (failed(checkSupportedModuleOps(module)))
-    return nullptr;
   std::unique_ptr<llvm::Module> llvmModule =
       prepareLLVMModule(module, llvmContext, name);
 
@@ -918,6 +931,17 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext,
     return nullptr;
   if (failed(translator.convertFunctions()))
     return nullptr;
+
+  // Convert other top-level operations if possible.
+  llvm::IRBuilder<> llvmBuilder(llvmContext);
+  for (Operation &o : getModuleBody(module).getOperations()) {
+    if (!isa<LLVM::LLVMFuncOp, LLVM::GlobalOp, LLVM::MetadataOp>(&o) &&
+        !o.hasTrait<OpTrait::IsTerminator>() &&
+        failed(translator.convertOperation(o, llvmBuilder))) {
+      return nullptr;
+    }
+  }
+
   if (llvm::verifyModule(*translator.llvmModule, &llvm::errs()))
     return nullptr;
 

diff  --git a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
index b6870f4055bda..cc5dc86e302e8 100644
--- a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s
 
-// expected-error @+1 {{unsupported module-level operation}}
+// expected-error @+1 {{cannot be converted to LLVM IR}}
 func @foo() {
   llvm.return
 }

diff  --git a/mlir/test/Target/LLVMIR/openmp-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-reduction.mlir
new file mode 100644
index 0000000000000..f9ec6f92f5a10
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-reduction.mlir
@@ -0,0 +1,418 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+// Only check the overall shape of the code and the presence of relevant
+// runtime calls. Actual IR checking is done at the OpenMPIRBuilder level.
+
+omp.reduction.declare @add_f32 : f32
+init {
+^bb0(%arg: f32):
+  %0 = llvm.mlir.constant(0.0 : f32) : f32
+  omp.yield (%0 : f32)
+}
+combiner {
+^bb1(%arg0: f32, %arg1: f32):
+  %1 = llvm.fadd %arg0, %arg1 : f32
+  omp.yield (%1 : f32)
+}
+atomic {
+^bb2(%arg2: !llvm.ptr<f32>, %arg3: !llvm.ptr<f32>):
+  %2 = llvm.load %arg3 : !llvm.ptr<f32>
+  llvm.atomicrmw fadd %arg2, %2 monotonic : f32
+  omp.yield
+}
+
+// CHECK-LABEL: @simple_reduction
+llvm.func @simple_reduction(%lb : i64, %ub : i64, %step : i64) {
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr<f32>
+  omp.parallel {
+    omp.wsloop (%iv) : i64 = (%lb) to (%ub) step (%step)
+    reduction(@add_f32 -> %0 : !llvm.ptr<f32>) {
+      %1 = llvm.mlir.constant(2.0 : f32) : f32
+      omp.reduction %1, %0 : !llvm.ptr<f32>
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
+}
+
+// Call to the outlined function.
+// CHECK: call void {{.*}} @__kmpc_fork_call
+// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Outlined function.
+// CHECK: define internal void @[[OUTLINED]]
+
+// Private reduction variable and its initialization.
+// CHECK: %[[PRIVATE:.+]] = alloca float
+// CHECK: store float 0.000000e+00, float* %[[PRIVATE]]
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Atomic reduction.
+// CHECK: %[[PARTIAL:.+]] = load float, float* %[[PRIVATE]]
+// CHECK: atomicrmw fadd float* %{{.*}}, float %[[PARTIAL]]
+
+// Non-atomic reduction:
+// CHECK: fadd float
+// CHECK: call void @__kmpc_end_reduce
+// CHECK: br label %[[FINALIZE:.+]]
+
+// CHECK: [[FINALIZE]]:
+// CHECK: call void @__kmpc_barrier
+
+// Update of the private variable using the reduction region
+// (the body block currently comes after all the other blocks).
+// CHECK: %[[PARTIAL:.+]] = load float, float* %[[PRIVATE]]
+// CHECK: %[[UPDATED:.+]] = fadd float %[[PARTIAL]], 2.000000e+00
+// CHECK: store float %[[UPDATED]], float* %[[PRIVATE]]
+
+// Reduction function.
+// CHECK: define internal void @[[REDFUNC]]
+// CHECK: fadd float
+
+// -----
+
+omp.reduction.declare @add_f32 : f32
+init {
+^bb0(%arg: f32):
+  %0 = llvm.mlir.constant(0.0 : f32) : f32
+  omp.yield (%0 : f32)
+}
+combiner {
+^bb1(%arg0: f32, %arg1: f32):
+  %1 = llvm.fadd %arg0, %arg1 : f32
+  omp.yield (%1 : f32)
+}
+atomic {
+^bb2(%arg2: !llvm.ptr<f32>, %arg3: !llvm.ptr<f32>):
+  %2 = llvm.load %arg3 : !llvm.ptr<f32>
+  llvm.atomicrmw fadd %arg2, %2 monotonic : f32
+  omp.yield
+}
+
+// When the same reduction declaration is used several times, its regions
+// are translated several times, which shouldn't lead to value/block
+// remapping assertions.
+// CHECK-LABEL: @reuse_declaration
+llvm.func @reuse_declaration(%lb : i64, %ub : i64, %step : i64) {
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr<f32>
+  %2 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr<f32>
+  omp.parallel {
+    omp.wsloop (%iv) : i64 = (%lb) to (%ub) step (%step)
+    reduction(@add_f32 -> %0 : !llvm.ptr<f32>, @add_f32 -> %2 : !llvm.ptr<f32>) {
+      %1 = llvm.mlir.constant(2.0 : f32) : f32
+      omp.reduction %1, %0 : !llvm.ptr<f32>
+      omp.reduction %1, %2 : !llvm.ptr<f32>
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
+}
+
+// Call to the outlined function.
+// CHECK: call void {{.*}} @__kmpc_fork_call
+// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Outlined function.
+// CHECK: define internal void @[[OUTLINED]]
+
+// Private reduction variable and its initialization.
+// CHECK: %[[PRIVATE1:.+]] = alloca float
+// CHECK: %[[PRIVATE2:.+]] = alloca float
+// CHECK: store float 0.000000e+00, float* %[[PRIVATE1]]
+// CHECK: store float 0.000000e+00, float* %[[PRIVATE2]]
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Atomic reduction.
+// CHECK: %[[PARTIAL1:.+]] = load float, float* %[[PRIVATE1]]
+// CHECK: atomicrmw fadd float* %{{.*}}, float %[[PARTIAL1]]
+// CHECK: %[[PARTIAL2:.+]] = load float, float* %[[PRIVATE2]]
+// CHECK: atomicrmw fadd float* %{{.*}}, float %[[PARTIAL2]]
+
+// Non-atomic reduction:
+// CHECK: fadd float
+// CHECK: fadd float
+// CHECK: call void @__kmpc_end_reduce
+// CHECK: br label %[[FINALIZE:.+]]
+
+// CHECK: [[FINALIZE]]:
+// CHECK: call void @__kmpc_barrier
+
+// Update of the private variable using the reduction region
+// (the body block currently comes after all the other blocks).
+// CHECK: %[[PARTIAL1:.+]] = load float, float* %[[PRIVATE1]]
+// CHECK: %[[UPDATED1:.+]] = fadd float %[[PARTIAL1]], 2.000000e+00
+// CHECK: store float %[[UPDATED1]], float* %[[PRIVATE1]]
+// CHECK: %[[PARTIAL2:.+]] = load float, float* %[[PRIVATE2]]
+// CHECK: %[[UPDATED2:.+]] = fadd float %[[PARTIAL2]], 2.000000e+00
+// CHECK: store float %[[UPDATED2]], float* %[[PRIVATE2]]
+
+// Reduction function.
+// CHECK: define internal void @[[REDFUNC]]
+// CHECK: fadd float
+// CHECK: fadd float
+
+
+// -----
+
+omp.reduction.declare @add_f32 : f32
+init {
+^bb0(%arg: f32):
+  %0 = llvm.mlir.constant(0.0 : f32) : f32
+  omp.yield (%0 : f32)
+}
+combiner {
+^bb1(%arg0: f32, %arg1: f32):
+  %1 = llvm.fadd %arg0, %arg1 : f32
+  omp.yield (%1 : f32)
+}
+atomic {
+^bb2(%arg2: !llvm.ptr<f32>, %arg3: !llvm.ptr<f32>):
+  %2 = llvm.load %arg3 : !llvm.ptr<f32>
+  llvm.atomicrmw fadd %arg2, %2 monotonic : f32
+  omp.yield
+}
+
+// It's okay not to reference the reduction variable in the body.
+// CHECK-LABEL: @missing_omp_reduction
+llvm.func @missing_omp_reduction(%lb : i64, %ub : i64, %step : i64) {
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr<f32>
+  %2 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr<f32>
+  omp.parallel {
+    omp.wsloop (%iv) : i64 = (%lb) to (%ub) step (%step)
+    reduction(@add_f32 -> %0 : !llvm.ptr<f32>, @add_f32 -> %2 : !llvm.ptr<f32>) {
+      %1 = llvm.mlir.constant(2.0 : f32) : f32
+      omp.reduction %1, %0 : !llvm.ptr<f32>
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
+}
+
+// Call to the outlined function.
+// CHECK: call void {{.*}} @__kmpc_fork_call
+// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Outlined function.
+// CHECK: define internal void @[[OUTLINED]]
+
+// Private reduction variable and its initialization.
+// CHECK: %[[PRIVATE1:.+]] = alloca float
+// CHECK: %[[PRIVATE2:.+]] = alloca float
+// CHECK: store float 0.000000e+00, float* %[[PRIVATE1]]
+// CHECK: store float 0.000000e+00, float* %[[PRIVATE2]]
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Atomic reduction.
+// CHECK: %[[PARTIAL1:.+]] = load float, float* %[[PRIVATE1]]
+// CHECK: atomicrmw fadd float* %{{.*}}, float %[[PARTIAL1]]
+// CHECK: %[[PARTIAL2:.+]] = load float, float* %[[PRIVATE2]]
+// CHECK: atomicrmw fadd float* %{{.*}}, float %[[PARTIAL2]]
+
+// Non-atomic reduction:
+// CHECK: fadd float
+// CHECK: fadd float
+// CHECK: call void @__kmpc_end_reduce
+// CHECK: br label %[[FINALIZE:.+]]
+
+// CHECK: [[FINALIZE]]:
+// CHECK: call void @__kmpc_barrier
+
+// Update of the private variable using the reduction region
+// (the body block currently comes after all the other blocks).
+// CHECK: %[[PARTIAL1:.+]] = load float, float* %[[PRIVATE1]]
+// CHECK: %[[UPDATED1:.+]] = fadd float %[[PARTIAL1]], 2.000000e+00
+// CHECK: store float %[[UPDATED1]], float* %[[PRIVATE1]]
+// CHECK-NOT: %{{.*}} = load float, float* %[[PRIVATE2]]
+// CHECK-NOT: %{{.*}} = fadd float %[[PARTIAL2]], 2.000000e+00
+
+// Reduction function.
+// CHECK: define internal void @[[REDFUNC]]
+// CHECK: fadd float
+// CHECK: fadd float
+
+// -----
+
+omp.reduction.declare @add_f32 : f32
+init {
+^bb0(%arg: f32):
+  %0 = llvm.mlir.constant(0.0 : f32) : f32
+  omp.yield (%0 : f32)
+}
+combiner {
+^bb1(%arg0: f32, %arg1: f32):
+  %1 = llvm.fadd %arg0, %arg1 : f32
+  omp.yield (%1 : f32)
+}
+atomic {
+^bb2(%arg2: !llvm.ptr<f32>, %arg3: !llvm.ptr<f32>):
+  %2 = llvm.load %arg3 : !llvm.ptr<f32>
+  llvm.atomicrmw fadd %arg2, %2 monotonic : f32
+  omp.yield
+}
+
+// It's okay to refer to the same reduction variable more than once in the
+// body.
+// CHECK-LABEL: @double_reference
+llvm.func @double_reference(%lb : i64, %ub : i64, %step : i64) {
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr<f32>
+  omp.parallel {
+    omp.wsloop (%iv) : i64 = (%lb) to (%ub) step (%step)
+    reduction(@add_f32 -> %0 : !llvm.ptr<f32>) {
+      %1 = llvm.mlir.constant(2.0 : f32) : f32
+      omp.reduction %1, %0 : !llvm.ptr<f32>
+      omp.reduction %1, %0 : !llvm.ptr<f32>
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
+}
+
+// Call to the outlined function.
+// CHECK: call void {{.*}} @__kmpc_fork_call
+// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Outlined function.
+// CHECK: define internal void @[[OUTLINED]]
+
+// Private reduction variable and its initialization.
+// CHECK: %[[PRIVATE:.+]] = alloca float
+// CHECK: store float 0.000000e+00, float* %[[PRIVATE]]
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Atomic reduction.
+// CHECK: %[[PARTIAL:.+]] = load float, float* %[[PRIVATE]]
+// CHECK: atomicrmw fadd float* %{{.*}}, float %[[PARTIAL]]
+
+// Non-atomic reduction:
+// CHECK: fadd float
+// CHECK: call void @__kmpc_end_reduce
+// CHECK: br label %[[FINALIZE:.+]]
+
+// CHECK: [[FINALIZE]]:
+// CHECK: call void @__kmpc_barrier
+
+// Update of the private variable using the reduction region
+// (the body block currently comes after all the other blocks).
+// CHECK: %[[PARTIAL:.+]] = load float, float* %[[PRIVATE]]
+// CHECK: %[[UPDATED:.+]] = fadd float %[[PARTIAL]], 2.000000e+00
+// CHECK: store float %[[UPDATED]], float* %[[PRIVATE]]
+// CHECK: %[[PARTIAL:.+]] = load float, float* %[[PRIVATE]]
+// CHECK: %[[UPDATED:.+]] = fadd float %[[PARTIAL]], 2.000000e+00
+// CHECK: store float %[[UPDATED]], float* %[[PRIVATE]]
+
+// Reduction function.
+// CHECK: define internal void @[[REDFUNC]]
+// CHECK: fadd float
+
+// -----
+
+omp.reduction.declare @add_f32 : f32
+init {
+^bb0(%arg: f32):
+  %0 = llvm.mlir.constant(0.0 : f32) : f32
+  omp.yield (%0 : f32)
+}
+combiner {
+^bb1(%arg0: f32, %arg1: f32):
+  %1 = llvm.fadd %arg0, %arg1 : f32
+  omp.yield (%1 : f32)
+}
+atomic {
+^bb2(%arg2: !llvm.ptr<f32>, %arg3: !llvm.ptr<f32>):
+  %2 = llvm.load %arg3 : !llvm.ptr<f32>
+  llvm.atomicrmw fadd %arg2, %2 monotonic : f32
+  omp.yield
+}
+
+omp.reduction.declare @mul_f32 : f32
+init {
+^bb0(%arg: f32):
+  %0 = llvm.mlir.constant(1.0 : f32) : f32
+  omp.yield (%0 : f32)
+}
+combiner {
+^bb1(%arg0: f32, %arg1: f32):
+  %1 = llvm.fmul %arg0, %arg1 : f32
+  omp.yield (%1 : f32)
+}
+
+// CHECK-LABEL: @no_atomic
+llvm.func @no_atomic(%lb : i64, %ub : i64, %step : i64) {
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr<f32>
+  %2 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr<f32>
+  omp.parallel {
+    omp.wsloop (%iv) : i64 = (%lb) to (%ub) step (%step)
+    reduction(@add_f32 -> %0 : !llvm.ptr<f32>, @mul_f32 -> %2 : !llvm.ptr<f32>) {
+      %1 = llvm.mlir.constant(2.0 : f32) : f32
+      omp.reduction %1, %0 : !llvm.ptr<f32>
+      omp.reduction %1, %2 : !llvm.ptr<f32>
+      omp.yield
+    }
+    omp.terminator
+  }
+  llvm.return
+}
+
+// Call to the outlined function.
+// CHECK: call void {{.*}} @__kmpc_fork_call
+// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Outlined function.
+// CHECK: define internal void @[[OUTLINED]]
+
+// Private reduction variable and its initialization.
+// CHECK: %[[PRIVATE1:.+]] = alloca float
+// CHECK: %[[PRIVATE2:.+]] = alloca float
+// CHECK: store float 0.000000e+00, float* %[[PRIVATE1]]
+// CHECK: store float 1.000000e+00, float* %[[PRIVATE2]]
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Atomic reduction not provided.
+// CHECK: unreachable
+
+// Non-atomic reduction:
+// CHECK: fadd float
+// CHECK: fmul float
+// CHECK: call void @__kmpc_end_reduce
+// CHECK: br label %[[FINALIZE:.+]]
+
+// CHECK: [[FINALIZE]]:
+// CHECK: call void @__kmpc_barrier
+
+// Update of the private variable using the reduction region
+// (the body block currently comes after all the other blocks).
+// CHECK: %[[PARTIAL1:.+]] = load float, float* %[[PRIVATE1]]
+// CHECK: %[[UPDATED1:.+]] = fadd float %[[PARTIAL1]], 2.000000e+00
+// CHECK: store float %[[UPDATED1]], float* %[[PRIVATE1]]
+// CHECK: %[[PARTIAL2:.+]] = load float, float* %[[PRIVATE2]]
+// CHECK: %[[UPDATED2:.+]] = fmul float %[[PARTIAL2]], 2.000000e+00
+// CHECK: store float %[[UPDATED2]], float* %[[PRIVATE2]]
+
+// Reduction function.
+// CHECK: define internal void @[[REDFUNC]]
+// CHECK: fadd float
+// CHECK: fmul float