[flang-commits] [flang] [llvm] [mlir] [flang] support fir.alloca operations inside of omp reduction ops (PR #84952)

Tom Eccles via flang-commits flang-commits at lists.llvm.org
Thu Mar 14 09:24:15 PDT 2024


https://github.com/tblah updated https://github.com/llvm/llvm-project/pull/84952

>From 958ea36de41ce928a0ecf5cee076293d2ae50474 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Fri, 9 Feb 2024 16:51:49 +0000
Subject: [PATCH 1/7] [flang][NFC] move loadIfRef to FIRBuilder

This will be useful for OpenMP too.

I changed the definition slightly to use `fir::isa_ref_type` (which also
includes llvm pointers) because I think it reads better using the common
type helpers. There shouldn't be any llvm pointers in lowering so this
isn't a functional change.

Commit series for by-ref openmp reductions: 1/3
---
 flang/include/flang/Optimizer/Builder/FIRBuilder.h |  4 ++++
 flang/lib/Lower/OpenACC.cpp                        | 12 ++----------
 flang/lib/Optimizer/Builder/FIRBuilder.cpp         |  7 +++++++
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index bd9b67b14b966c..d61bf681be6194 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -309,6 +309,10 @@ class FirOpBuilder : public mlir::OpBuilder, public mlir::OpBuilder::Listener {
   void createStoreWithConvert(mlir::Location loc, mlir::Value val,
                               mlir::Value addr);
 
+  /// Create a fir.load if \p val is a reference or pointer type. Return the
+  /// result of the load if it was created, otherwise return \p val
+  mlir::Value loadIfRef(mlir::Location loc, mlir::Value val);
+
   /// Create a new FuncOp. If the function may have already been created, use
   /// `addNamedFunction` instead.
   mlir::func::FuncOp createFunction(mlir::Location loc, llvm::StringRef name,
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 151077d81ba14a..d2c6006ecf914a 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -1041,14 +1041,6 @@ static mlir::Value genLogicalCombiner(fir::FirOpBuilder &builder,
   return builder.create<fir::ConvertOp>(loc, value1.getType(), combined);
 }
 
-static mlir::Value loadIfRef(fir::FirOpBuilder &builder, mlir::Location loc,
-                             mlir::Value value) {
-  if (mlir::isa<fir::ReferenceType, fir::PointerType, fir::HeapType>(
-          value.getType()))
-    return builder.create<fir::LoadOp>(loc, value);
-  return value;
-}
-
 static mlir::Value genComparisonCombiner(fir::FirOpBuilder &builder,
                                          mlir::Location loc,
                                          mlir::arith::CmpIPredicate pred,
@@ -1066,8 +1058,8 @@ static mlir::Value genScalarCombiner(fir::FirOpBuilder &builder,
                                      mlir::acc::ReductionOperator op,
                                      mlir::Type ty, mlir::Value value1,
                                      mlir::Value value2) {
-  value1 = loadIfRef(builder, loc, value1);
-  value2 = loadIfRef(builder, loc, value2);
+  value1 = builder.loadIfRef(loc, value1);
+  value2 = builder.loadIfRef(loc, value2);
   if (op == mlir::acc::ReductionOperator::AccAdd) {
     if (ty.isIntOrIndex())
       return builder.create<mlir::arith::AddIOp>(loc, value1, value2);
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 788c99e40105a2..12da7412888a3b 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -16,6 +16,7 @@
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/Support/FatalError.h"
 #include "flang/Optimizer/Support/InternalNames.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -404,6 +405,12 @@ void fir::FirOpBuilder::createStoreWithConvert(mlir::Location loc,
   create<fir::StoreOp>(loc, cast, addr);
 }
 
+mlir::Value fir::FirOpBuilder::loadIfRef(mlir::Location loc, mlir::Value val) {
+  if (fir::isa_ref_type(val.getType()))
+    return create<fir::LoadOp>(loc, val);
+  return val;
+}
+
 fir::StringLitOp fir::FirOpBuilder::createStringLitOp(mlir::Location loc,
                                                       llvm::StringRef data) {
   auto type = fir::CharacterType::get(getContext(), 1, data.size());

>From ba0182b73b1c6f8b91efa8d74b185b3bfc0f1cb6 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Thu, 15 Feb 2024 13:29:00 +0000
Subject: [PATCH 2/7] [flang][TBAABuilder] not all loads and stores are inside
 of functions

TBAA builder assumed that all loads/stores are inside of functions and
hit an assertion once it found loads and stores inside of an
omp::ReductionDeclareOp.

For now just don't add TBAA tags to those loads and stores. They would
end up in a different TBAA tree to the host function after
OpenMPIRBuilder inlines them anyway so there isn't an easy way of making
this work.

Commit series for by-ref OpenMP reductions: 2/3
---
 flang/lib/Optimizer/CodeGen/TBAABuilder.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp b/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
index 8e7f59f76383c9..b1b0e9b766a625 100644
--- a/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
+++ b/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
@@ -102,7 +102,8 @@ void TBAABuilder::attachTBAATag(AliasAnalysisOpInterface op, Type baseFIRType,
     return;
 
   mlir::LLVM::LLVMFuncOp func = op->getParentOfType<mlir::LLVM::LLVMFuncOp>();
-  assert(func && "func.func should have already been converted to llvm.func");
+  if (!func)
+    return;
 
   ++tagAttachmentCounter;
   if (tagAttachmentLimit != kTagAttachmentUnlimited &&

>From 07d9f417ecc8f0ea44969b6648218d4c72274aa6 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Tue, 6 Feb 2024 17:31:35 +0000
Subject: [PATCH 3/7] [flang][OpenMP][OMPIRBuilder][mlir] Optionally pass
 reduction vars by ref

Previously reduction variables were always passed by value into and out
of the initialization and combiner regions of the OpenMP reduction
declare operation.

This worked well for reductions of primitive types (and might perform
better than passing by reference). But passing by reference will be
useful for array and derived type reductions (e.g. to move allocation
inside of the init region).

Passing reductions by reference requires different LLVM-IR generation
when lowering from MLIR because some of the loads/stores/allocations will
now be moved inside of the init and combiner regions. This alternate
code generation is requested using a new attribute to omp.wsloop and
omp.parallel.

Existing lowerings from mlir are unaffected (these will continue to use
the by-value argument passing.

Flang will continue to pass by-value argument passing for trivial types
unless a (hidden) command line argument is supplied. Non-trivial types
will always use the by-ref lowering.

Array reductions are not ready yet (but are coming very soon). In the
meantime, this is tested by forcing existing reductions to use by-ref.

Commit series for by-ref OpenMP reductions 3/3

Co-authored-by: Mats Petersson <mats.petersson at arm.com>
---
 flang/lib/Lower/OpenMP/OpenMP.cpp             |  18 +-
 flang/lib/Lower/OpenMP/ReductionProcessor.cpp | 120 +++--
 flang/lib/Lower/OpenMP/ReductionProcessor.h   |  18 +-
 .../FIR/parallel-reduction-add-byref.f90      | 117 +++++
 .../OpenMP/FIR/wsloop-reduction-add-byref.f90 | 392 ++++++++++++++++
 .../FIR/wsloop-reduction-iand-byref.f90       |  46 ++
 .../FIR/wsloop-reduction-ieor-byref.f90       |  45 ++
 .../OpenMP/FIR/wsloop-reduction-ior-byref.f90 |  45 ++
 .../wsloop-reduction-logical-eqv-byref.f90    | 187 ++++++++
 .../wsloop-reduction-logical-neqv-byref.f90   | 189 ++++++++
 .../OpenMP/FIR/wsloop-reduction-max-byref.f90 |  90 ++++
 .../OpenMP/FIR/wsloop-reduction-min-byref.f90 |  91 ++++
 .../Lower/OpenMP/default-clause-byref.f90     | 385 ++++++++++++++++
 .../delayed-privatization-reduction-byref.f90 |  30 ++
 .../OpenMP/parallel-reduction-add-byref.f90   | 125 +++++
 .../Lower/OpenMP/parallel-reduction-byref.f90 |  44 ++
 .../parallel-wsloop-reduction-byref.f90       |  16 +
 .../OpenMP/wsloop-reduction-add-byref.f90     | 433 ++++++++++++++++++
 .../wsloop-reduction-add-hlfir-byref.f90      |  58 +++
 .../OpenMP/wsloop-reduction-iand-byref.f90    |  64 +++
 .../OpenMP/wsloop-reduction-ieor-byref.f90    |  55 +++
 .../OpenMP/wsloop-reduction-ior-byref.f90     |  64 +++
 .../wsloop-reduction-logical-and-byref.f90    | 206 +++++++++
 .../wsloop-reduction-logical-eqv-byref.f90    | 202 ++++++++
 .../wsloop-reduction-logical-neqv-byref.f90   | 207 +++++++++
 .../wsloop-reduction-logical-or-byref.f90     | 204 +++++++++
 .../OpenMP/wsloop-reduction-max-2-byref.f90   |  20 +
 .../OpenMP/wsloop-reduction-max-byref.f90     | 152 ++++++
 .../wsloop-reduction-max-hlfir-byref.f90      |  62 +++
 .../OpenMP/wsloop-reduction-min-byref.f90     | 154 +++++++
 .../OpenMP/wsloop-reduction-mul-byref.f90     | 414 +++++++++++++++++
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |   4 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  30 +-
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td |  12 +-
 mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp  |   5 +-
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  99 ++--
 .../Target/LLVMIR/openmp-reduction-byref.mlir |  66 +++
 37 files changed, 4393 insertions(+), 76 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/default-clause-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/parallel-reduction-add-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/parallel-reduction-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/parallel-wsloop-reduction-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-max-2-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90
 create mode 100644 mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 185e0316870e94..d9648f3d692cc6 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -600,6 +600,10 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
     return reductionSymbols;
   };
 
+  mlir::UnitAttr byrefAttr;
+  if (ReductionProcessor::doReductionByRef(reductionVars))
+    byrefAttr = converter.getFirOpBuilder().getUnitAttr();
+
   OpWithBodyGenInfo genInfo =
       OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
           .setGenNested(genNested)
@@ -619,7 +623,7 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
             : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
                                    reductionDeclSymbols),
         procBindKindAttr, /*private_vars=*/llvm::SmallVector<mlir::Value>{},
-        /*privatizers=*/nullptr);
+        /*privatizers=*/nullptr, byrefAttr);
   }
 
   bool privatize = !outerCombined;
@@ -683,7 +687,8 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
       delayedPrivatizationInfo.privatizers.empty()
           ? nullptr
           : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
-                                 privatizers));
+                                 privatizers),
+      byrefAttr);
 }
 
 static mlir::omp::SectionOp
@@ -1568,7 +1573,7 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
   llvm::SmallVector<const Fortran::semantics::Symbol *> reductionSymbols;
   mlir::omp::ClauseOrderKindAttr orderClauseOperand;
   mlir::omp::ClauseScheduleKindAttr scheduleValClauseOperand;
-  mlir::UnitAttr nowaitClauseOperand, scheduleSimdClauseOperand;
+  mlir::UnitAttr nowaitClauseOperand, byrefOperand, scheduleSimdClauseOperand;
   mlir::IntegerAttr orderedClauseOperand;
   mlir::omp::ScheduleModifierAttr scheduleModClauseOperand;
   std::size_t loopVarTypeSize;
@@ -1585,6 +1590,9 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
   convertLoopBounds(converter, loc, lowerBound, upperBound, step,
                     loopVarTypeSize);
 
+  if (ReductionProcessor::doReductionByRef(reductionVars))
+    byrefOperand = firOpBuilder.getUnitAttr();
+
   auto wsLoopOp = firOpBuilder.create<mlir::omp::WsLoopOp>(
       loc, lowerBound, upperBound, step, linearVars, linearStepVars,
       reductionVars,
@@ -1594,8 +1602,8 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
                                  reductionDeclSymbols),
       scheduleValClauseOperand, scheduleChunkClauseOperand,
       /*schedule_modifiers=*/nullptr,
-      /*simd_modifier=*/nullptr, nowaitClauseOperand, orderedClauseOperand,
-      orderClauseOperand,
+      /*simd_modifier=*/nullptr, nowaitClauseOperand, byrefOperand,
+      orderedClauseOperand, orderClauseOperand,
       /*inclusive=*/firOpBuilder.getUnitAttr());
 
   // Handle attribute based clauses.
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
index a8b98f3f567249..34959e95dce04a 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
@@ -14,9 +14,16 @@
 
 #include "flang/Lower/AbstractConverter.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Parser/tools.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "llvm/Support/CommandLine.h"
+
+static llvm::cl::opt<bool> forceByrefReduction(
+    "force-byref-reduction",
+    llvm::cl::desc("Pass all reduction arguments by reference"),
+    llvm::cl::Hidden);
 
 namespace Fortran {
 namespace lower {
@@ -76,16 +83,24 @@ bool ReductionProcessor::supportedIntrinsicProcReduction(
 }
 
 std::string ReductionProcessor::getReductionName(llvm::StringRef name,
-                                                 mlir::Type ty) {
+                                                 mlir::Type ty, bool isByRef) {
+  ty = fir::unwrapRefType(ty);
+
+  // extra string to distinguish reduction functions for variables passed by
+  // reference
+  llvm::StringRef byrefAddition{""};
+  if (isByRef)
+    byrefAddition = "_byref";
+
   return (llvm::Twine(name) +
           (ty.isIntOrIndex() ? llvm::Twine("_i_") : llvm::Twine("_f_")) +
-          llvm::Twine(ty.getIntOrFloatBitWidth()))
+          llvm::Twine(ty.getIntOrFloatBitWidth()) + byrefAddition)
       .str();
 }
 
 std::string ReductionProcessor::getReductionName(
     Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp,
-    mlir::Type ty) {
+    mlir::Type ty, bool isByRef) {
   std::string reductionName;
 
   switch (intrinsicOp) {
@@ -108,13 +123,14 @@ std::string ReductionProcessor::getReductionName(
     break;
   }
 
-  return getReductionName(reductionName, ty);
+  return getReductionName(reductionName, ty, isByRef);
 }
 
 mlir::Value
 ReductionProcessor::getReductionInitValue(mlir::Location loc, mlir::Type type,
                                           ReductionIdentifier redId,
                                           fir::FirOpBuilder &builder) {
+  type = fir::unwrapRefType(type);
   assert((fir::isa_integer(type) || fir::isa_real(type) ||
           type.isa<fir::LogicalType>()) &&
          "only integer, logical and real types are currently supported");
@@ -188,6 +204,7 @@ mlir::Value ReductionProcessor::createScalarCombiner(
     fir::FirOpBuilder &builder, mlir::Location loc, ReductionIdentifier redId,
     mlir::Type type, mlir::Value op1, mlir::Value op2) {
   mlir::Value reductionOp;
+  type = fir::unwrapRefType(type);
   switch (redId) {
   case ReductionIdentifier::MAX:
     reductionOp =
@@ -268,7 +285,8 @@ mlir::Value ReductionProcessor::createScalarCombiner(
 
 mlir::omp::ReductionDeclareOp ReductionProcessor::createReductionDecl(
     fir::FirOpBuilder &builder, llvm::StringRef reductionOpName,
-    const ReductionIdentifier redId, mlir::Type type, mlir::Location loc) {
+    const ReductionIdentifier redId, mlir::Type type, mlir::Location loc,
+    bool isByRef) {
   mlir::OpBuilder::InsertionGuard guard(builder);
   mlir::ModuleOp module = builder.getModule();
 
@@ -278,14 +296,24 @@ mlir::omp::ReductionDeclareOp ReductionProcessor::createReductionDecl(
     return decl;
 
   mlir::OpBuilder modBuilder(module.getBodyRegion());
+  mlir::Type valTy = fir::unwrapRefType(type);
+  if (!isByRef)
+    type = valTy;
 
   decl = modBuilder.create<mlir::omp::ReductionDeclareOp>(loc, reductionOpName,
                                                           type);
   builder.createBlock(&decl.getInitializerRegion(),
                       decl.getInitializerRegion().end(), {type}, {loc});
   builder.setInsertionPointToEnd(&decl.getInitializerRegion().back());
+
   mlir::Value init = getReductionInitValue(loc, type, redId, builder);
-  builder.create<mlir::omp::YieldOp>(loc, init);
+  if (isByRef) {
+    mlir::Value alloca = builder.create<fir::AllocaOp>(loc, valTy);
+    builder.createStoreWithConvert(loc, init, alloca);
+    builder.create<mlir::omp::YieldOp>(loc, alloca);
+  } else {
+    builder.create<mlir::omp::YieldOp>(loc, init);
+  }
 
   builder.createBlock(&decl.getReductionRegion(),
                       decl.getReductionRegion().end(), {type, type},
@@ -294,14 +322,41 @@ mlir::omp::ReductionDeclareOp ReductionProcessor::createReductionDecl(
   builder.setInsertionPointToEnd(&decl.getReductionRegion().back());
   mlir::Value op1 = decl.getReductionRegion().front().getArgument(0);
   mlir::Value op2 = decl.getReductionRegion().front().getArgument(1);
+  mlir::Value outAddr = op1;
+
+  op1 = builder.loadIfRef(loc, op1);
+  op2 = builder.loadIfRef(loc, op2);
 
   mlir::Value reductionOp =
       createScalarCombiner(builder, loc, redId, type, op1, op2);
-  builder.create<mlir::omp::YieldOp>(loc, reductionOp);
+  if (isByRef) {
+    builder.create<fir::StoreOp>(loc, reductionOp, outAddr);
+    builder.create<mlir::omp::YieldOp>(loc, outAddr);
+  } else {
+    builder.create<mlir::omp::YieldOp>(loc, reductionOp);
+  }
 
   return decl;
 }
 
+bool ReductionProcessor::doReductionByRef(
+    const llvm::SmallVectorImpl<mlir::Value> &reductionVars) {
+  if (reductionVars.empty())
+    return false;
+  if (forceByrefReduction)
+    return true;
+
+  for (mlir::Value reductionVar : reductionVars) {
+    if (auto declare =
+            mlir::dyn_cast<hlfir::DeclareOp>(reductionVar.getDefiningOp()))
+      reductionVar = declare.getMemref();
+
+    if (!fir::isa_trivial(fir::unwrapRefType(reductionVar.getType())))
+      return true;
+  }
+  return false;
+}
+
 void ReductionProcessor::addReductionDecl(
     mlir::Location currentLocation,
     Fortran::lower::AbstractConverter &converter,
@@ -315,6 +370,24 @@ void ReductionProcessor::addReductionDecl(
   const auto &redOperator{
       std::get<Fortran::parser::OmpReductionOperator>(reduction.t)};
   const auto &objectList{std::get<Fortran::parser::OmpObjectList>(reduction.t)};
+
+  // initial pass to collect all recuction vars so we can figure out if this
+  // should happen byref
+  for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
+    if (const auto *name{
+            Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
+      if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
+        if (reductionSymbols)
+          reductionSymbols->push_back(symbol);
+        mlir::Value symVal = converter.getSymbolAddress(*symbol);
+        if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
+          symVal = declOp.getBase();
+        reductionVars.push_back(symVal);
+      }
+    }
+  }
+  const bool isByRef = doReductionByRef(reductionVars);
+
   if (const auto &redDefinedOp =
           std::get_if<Fortran::parser::DefinedOperator>(&redOperator.u)) {
     const auto &intrinsicOp{
@@ -338,23 +411,20 @@ void ReductionProcessor::addReductionDecl(
       if (const auto *name{
               Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
         if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
-          if (reductionSymbols)
-            reductionSymbols->push_back(symbol);
           mlir::Value symVal = converter.getSymbolAddress(*symbol);
           if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
             symVal = declOp.getBase();
-          mlir::Type redType =
-              symVal.getType().cast<fir::ReferenceType>().getEleTy();
-          reductionVars.push_back(symVal);
-          if (redType.isa<fir::LogicalType>())
+          auto redType = symVal.getType().cast<fir::ReferenceType>();
+          if (redType.getEleTy().isa<fir::LogicalType>())
             decl = createReductionDecl(
                 firOpBuilder,
-                getReductionName(intrinsicOp, firOpBuilder.getI1Type()), redId,
-                redType, currentLocation);
-          else if (redType.isIntOrIndexOrFloat()) {
-            decl = createReductionDecl(firOpBuilder,
-                                       getReductionName(intrinsicOp, redType),
-                                       redId, redType, currentLocation);
+                getReductionName(intrinsicOp, firOpBuilder.getI1Type(),
+                                 isByRef),
+                redId, redType, currentLocation, isByRef);
+          else if (redType.getEleTy().isIntOrIndexOrFloat()) {
+            decl = createReductionDecl(
+                firOpBuilder, getReductionName(intrinsicOp, redType, isByRef),
+                redId, redType, currentLocation, isByRef);
           } else {
             TODO(currentLocation, "Reduction of some types is not supported");
           }
@@ -374,21 +444,17 @@ void ReductionProcessor::addReductionDecl(
         if (const auto *name{
                 Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
           if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
-            if (reductionSymbols)
-              reductionSymbols->push_back(symbol);
             mlir::Value symVal = converter.getSymbolAddress(*symbol);
             if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
               symVal = declOp.getBase();
-            mlir::Type redType =
-                symVal.getType().cast<fir::ReferenceType>().getEleTy();
-            reductionVars.push_back(symVal);
-            assert(redType.isIntOrIndexOrFloat() &&
+            auto redType = symVal.getType().cast<fir::ReferenceType>();
+            assert(redType.getEleTy().isIntOrIndexOrFloat() &&
                    "Unsupported reduction type");
             decl = createReductionDecl(
                 firOpBuilder,
                 getReductionName(getRealName(*reductionIntrinsic).ToString(),
-                                 redType),
-                redId, redType, currentLocation);
+                                 redType, isByRef),
+                redId, redType, currentLocation, isByRef);
             reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get(
                 firOpBuilder.getContext(), decl.getSymName()));
           }
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.h b/flang/lib/Lower/OpenMP/ReductionProcessor.h
index 00770fe81d1ef6..679580f2a3cac7 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.h
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.h
@@ -14,6 +14,7 @@
 #define FORTRAN_LOWER_REDUCTIONPROCESSOR_H
 
 #include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Semantics/symbol.h"
 #include "flang/Semantics/type.h"
@@ -71,11 +72,15 @@ class ReductionProcessor {
   static const Fortran::semantics::SourceName
   getRealName(const Fortran::parser::ProcedureDesignator &pd);
 
-  static std::string getReductionName(llvm::StringRef name, mlir::Type ty);
+  static bool
+  doReductionByRef(const llvm::SmallVectorImpl<mlir::Value> &reductionVars);
+
+  static std::string getReductionName(llvm::StringRef name, mlir::Type ty,
+                                      bool isByRef);
 
   static std::string getReductionName(
       Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp,
-      mlir::Type ty);
+      mlir::Type ty, bool isByRef);
 
   /// This function returns the identity value of the operator \p
   /// reductionOpName. For example:
@@ -103,9 +108,11 @@ class ReductionProcessor {
   /// symbol table. The declaration has a constant initializer with the neutral
   /// value `initValue`, and the reduction combiner carried over from `reduce`.
   /// TODO: Generalize this for non-integer types, add atomic region.
-  static mlir::omp::ReductionDeclareOp createReductionDecl(
-      fir::FirOpBuilder &builder, llvm::StringRef reductionOpName,
-      const ReductionIdentifier redId, mlir::Type type, mlir::Location loc);
+  static mlir::omp::ReductionDeclareOp
+  createReductionDecl(fir::FirOpBuilder &builder,
+                      llvm::StringRef reductionOpName,
+                      const ReductionIdentifier redId, mlir::Type type,
+                      mlir::Location loc, bool isByRef);
 
   /// Creates a reduction declaration and associates it with an OpenMP block
   /// directive.
@@ -124,6 +131,7 @@ mlir::Value
 ReductionProcessor::getReductionOperation(fir::FirOpBuilder &builder,
                                           mlir::Type type, mlir::Location loc,
                                           mlir::Value op1, mlir::Value op2) {
+  type = fir::unwrapRefType(type);
   assert(type.isIntOrIndexOrFloat() &&
          "only integer and float types are currently supported");
   if (type.isIntOrIndex())
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90
new file mode 100644
index 00000000000000..ca432662b77c44
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90
@@ -0,0 +1,117 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_F32_NAME:.*]] : !fir.ref<f32>
+!CHECK-SAME: init {
+!CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
+!CHECK:  %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  %[[REF:.*]] = fir.alloca f32
+!CHECKL  fir.store [[%C0_1]] to %[[REF]] : !fir.ref<f32>
+!CHECK:  omp.yield(%[[REF]] : !fir.ref<f32>)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+!CHECK:  %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+!CHECK:  %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+!CHECK:  %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] {{.*}}: f32
+!CHECK:  fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+!CHECK:  omp.yield(%[[ARG0]] : !fir.ref<f32>)
+!CHECK: }
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_I32_NAME:.*]] : !fir.ref<i32>
+!CHECK-SAME: init {
+!CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
+!CHECK:  %[[C0_1:.*]] = arith.constant 0 : i32
+!CHECK:  %[[REF:.*]] = fir.alloca i32
+!CHECKL  fir.store [[%C0_1]] to %[[REF]] : !fir.ref<i32>
+!CHECK:  omp.yield(%[[REF]] : !fir.ref<i32>)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+!CHECK:  %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+!CHECK:  %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+!CHECK:  %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32
+!CHECK:  fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+!CHECK:  omp.yield(%[[ARG0]] : !fir.ref<i32>)
+!CHECK: }
+
+!CHECK-LABEL: func.func @_QPsimple_int_add
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_addEi"}
+!CHECK:  %[[I_START:.*]] = arith.constant 0 : i32
+!CHECK:  fir.store %[[I_START]] to %[[IREF]] : !fir.ref<i32>
+!CHECK:  omp.parallel byref reduction(@[[RED_I32_NAME]] %[[IREF]] -> %[[PRV:.+]] : !fir.ref<i32>) {
+!CHECK:    %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
+!CHECK:    %[[I_INCR:.+]] = arith.constant 1 : i32
+!CHECK:    %[[RES:.+]] = arith.addi %[[LPRV]], %[[I_INCR]]
+!CHECK:    fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine simple_int_add
+    integer :: i
+    i = 0
+
+    !$omp parallel reduction(+:i)
+    i = i + 1
+    !$omp end parallel
+
+    print *, i
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_real_add
+!CHECK:  %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFsimple_real_addEr"}
+!CHECK:  %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  fir.store %[[R_START]] to %[[RREF]] : !fir.ref<f32>
+!CHECK:  omp.parallel byref reduction(@[[RED_F32_NAME]] %[[RREF]] -> %[[PRV:.+]] : !fir.ref<f32>) {
+!CHECK:    %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
+!CHECK:    %[[R_INCR:.+]] = arith.constant 1.500000e+00 : f32
+!CHECK:    %[[RES]] = arith.addf %[[LPRV]], %[[R_INCR]] {{.*}} : f32
+!CHECK:    fir.store %[[RES]] to %[[PRV]] : !fir.ref<f32>
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine simple_real_add
+    real :: r
+    r = 0.0
+
+    !$omp parallel reduction(+:r)
+    r = r + 1.5
+    !$omp end parallel
+
+    print *, r
+end subroutine
+
+!CHECK-LABEL: func.func @_QPint_real_add
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFint_real_addEi"}
+!CHECK:  %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFint_real_addEr"}
+!CHECK:  %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  fir.store %[[R_START]] to %[[RREF]] : !fir.ref<f32>
+!CHECK:  %[[I_START:.*]] = arith.constant 0 : i32
+!CHECK:  fir.store %[[I_START]] to %[[IREF]] : !fir.ref<i32>
+!CHECK:  omp.parallel byref reduction(@[[RED_I32_NAME]] %[[IREF]] -> %[[PRV0:.+]] : !fir.ref<i32>, @[[RED_F32_NAME]] %[[RREF]] -> %[[PRV1:.+]] : !fir.ref<f32>) {
+!CHECK:    %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
+!CHECK:    %[[LPRV1:.+]] = fir.load %[[PRV1]] : !fir.ref<f32>
+!CHECK:    %[[RES1:.+]] = arith.addf %[[R_INCR]], %[[LPRV1]] {{.*}} : f32
+!CHECK:    fir.store %[[RES1]] to %[[PRV1]]
+!CHECK:    %[[LPRV0:.+]] = fir.load %[[PRV0]] : !fir.ref<i32>
+!CHECK:    %[[I_INCR:.*]] = arith.constant 3 : i32
+!CHECK:    %[[RES0:.+]] = arith.addi %[[LPRV0]], %[[I_INCR]]
+!CHECK:    fir.store %[[RES0]] to %[[PRV0]]
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine int_real_add
+    real :: r
+    integer :: i
+
+    r = 0.0
+    i = 0
+
+    !$omp parallel reduction(+:i,r)
+    r = 1.5 + r
+    i = i + 3
+    !$omp end parallel
+
+    print *, r
+    print *, i
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90
new file mode 100644
index 00000000000000..d4d3452d3e8682
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90
@@ -0,0 +1,392 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_f_64_byref : !fir.ref<f64>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<f64>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0.000000e+00 : f64
+! CHECK:            %[[REF:.*]] = fir.alloca f64
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<f64>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<f64>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<f64>, %[[ARG1:.*]]: !fir.ref<f64>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f64>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f64>
+! CHECK:           %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] fastmath<contract> : f64
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f64>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<f64>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_i_64_byref : !fir.ref<i64>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i64>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i64
+! CHECK:            %[[REF:.*]] = fir.alloca i64
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i64>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<i64>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i64>, %[[ARG1:.*]]: !fir.ref<i64>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i64>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i64>
+! CHECK:           %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i64
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i64>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i64>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_f_32_byref : !fir.ref<f32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:            %[[REF:.*]] = fir.alloca f32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<f32>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<f32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+! CHECK:           %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] fastmath<contract> : f32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<f32>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>)  for  (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK:               fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_int_reduction
+  integer :: x
+  x = 0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = x + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+
+! CHECK-LABEL:   func.func @_QPsimple_real_reduction() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reductionEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_f_32_byref %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>)  for  (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK:               fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
+! CHECK:               %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> f32
+! CHECK:               %[[VAL_12:.*]] = arith.addf %[[VAL_9]], %[[VAL_11]] fastmath<contract> : f32
+! CHECK:               fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_real_reduction
+  real :: x
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = x + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction_switch_order() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>)  for  (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK:               fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_10:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_int_reduction_switch_order
+  integer :: x
+  x = 0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = i + x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_real_reduction_switch_order() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_f_32_byref %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>)  for  (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK:               fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> f32
+! CHECK:               %[[VAL_11:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
+! CHECK:               %[[VAL_12:.*]] = arith.addf %[[VAL_10]], %[[VAL_11]] fastmath<contract> : f32
+! CHECK:               fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_real_reduction_switch_order
+  real :: x
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = i + x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_int_reductions_same_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_int_reductions_same_typeEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_int_reductions_same_typeEx"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFmultiple_int_reductions_same_typeEy"}
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_int_reductions_same_typeEz"}
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i32
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<i32>
+! CHECK:           %[[VAL_5:.*]] = arith.constant 0 : i32
+! CHECK:           fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<i32>
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK:           fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_1]] -> %[[VAL_11:.*]] : !fir.ref<i32>, @add_reduction_i_32_byref %[[VAL_2]] -> %[[VAL_12:.*]] : !fir.ref<i32>, @add_reduction_i_32_byref %[[VAL_3]] -> %[[VAL_13:.*]] : !fir.ref<i32>)  for  (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
+! CHECK:               fir.store %[[VAL_14]] to %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]] = arith.addi %[[VAL_15]], %[[VAL_16]] : i32
+! CHECK:               fir.store %[[VAL_17]] to %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_12]] : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = arith.addi %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               fir.store %[[VAL_20]] to %[[VAL_12]] : !fir.ref<i32>
+! CHECK:               %[[VAL_21:.*]] = fir.load %[[VAL_13]] : !fir.ref<i32>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_23:.*]] = arith.addi %[[VAL_21]], %[[VAL_22]] : i32
+! CHECK:               fir.store %[[VAL_23]] to %[[VAL_13]] : !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine multiple_int_reductions_same_type
+  integer :: x,y,z
+  x = 0
+  y = 0
+  z = 0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_real_reductions_same_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_real_reductions_same_typeEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFmultiple_real_reductions_same_typeEx"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_real_reductions_same_typeEy"}
+! CHECK:           %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_real_reductions_same_typeEz"}
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<f32>
+! CHECK:           %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<f32>
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_f_32_byref %[[VAL_1]] -> %[[VAL_11:.*]] : !fir.ref<f32>, @add_reduction_f_32_byref %[[VAL_2]] -> %[[VAL_12:.*]] : !fir.ref<f32>, @add_reduction_f_32_byref %[[VAL_3]] -> %[[VAL_13:.*]] : !fir.ref<f32>)  for  (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
+! CHECK:               fir.store %[[VAL_14]] to %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_11]] : !fir.ref<f32>
+! CHECK:               %[[VAL_16:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i32) -> f32
+! CHECK:               %[[VAL_18:.*]] = arith.addf %[[VAL_15]], %[[VAL_17]] fastmath<contract> : f32
+! CHECK:               fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<f32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_12]] : !fir.ref<f32>
+! CHECK:               %[[VAL_20:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> f32
+! CHECK:               %[[VAL_22:.*]] = arith.addf %[[VAL_19]], %[[VAL_21]] fastmath<contract> : f32
+! CHECK:               fir.store %[[VAL_22]] to %[[VAL_12]] : !fir.ref<f32>
+! CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_13]] : !fir.ref<f32>
+! CHECK:               %[[VAL_24:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i32) -> f32
+! CHECK:               %[[VAL_26:.*]] = arith.addf %[[VAL_23]], %[[VAL_25]] fastmath<contract> : f32
+! CHECK:               fir.store %[[VAL_26]] to %[[VAL_13]] : !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine multiple_real_reductions_same_type
+  real :: x,y,z
+  x = 0.0
+  y = 0.0
+  z = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions_different_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductions_different_typeEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca f64 {bindc_name = "w", uniq_name = "_QFmultiple_reductions_different_typeEw"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_reductions_different_typeEx"}
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFmultiple_reductions_different_typeEy"}
+! CHECK:           %[[VAL_4:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_reductions_different_typeEz"}
+! CHECK:           %[[VAL_5:.*]] = arith.constant 0 : i32
+! CHECK:           fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<i32>
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i64
+! CHECK:           fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<i64>
+! CHECK:           %[[VAL_7:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           fir.store %[[VAL_7]] to %[[VAL_4]] : !fir.ref<f32>
+! CHECK:           %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f64
+! CHECK:           fir.store %[[VAL_8]] to %[[VAL_1]] : !fir.ref<f64>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_9:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_2]] -> %[[VAL_13:.*]] : !fir.ref<i32>, @add_reduction_i_64_byref %[[VAL_3]] -> %[[VAL_14:.*]] : !fir.ref<i64>, @add_reduction_f_32_byref %[[VAL_4]] -> %[[VAL_15:.*]] : !fir.ref<f32>, @add_reduction_f_64_byref %[[VAL_1]] -> %[[VAL_16:.*]] : !fir.ref<f64>)  for  (%[[VAL_17:.*]]) : i32 = (%[[VAL_10]]) to (%[[VAL_11]]) inclusive step (%[[VAL_12]]) {
+! CHECK:               fir.store %[[VAL_17]] to %[[VAL_9]] : !fir.ref<i32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_13]] : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = arith.addi %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               fir.store %[[VAL_20]] to %[[VAL_13]] : !fir.ref<i32>
+! CHECK:               %[[VAL_21:.*]] = fir.load %[[VAL_14]] : !fir.ref<i64>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i32) -> i64
+! CHECK:               %[[VAL_24:.*]] = arith.addi %[[VAL_21]], %[[VAL_23]] : i64
+! CHECK:               fir.store %[[VAL_24]] to %[[VAL_14]] : !fir.ref<i64>
+! CHECK:               %[[VAL_25:.*]] = fir.load %[[VAL_15]] : !fir.ref<f32>
+! CHECK:               %[[VAL_26:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CHECK:               %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (i32) -> f32
+! CHECK:               %[[VAL_28:.*]] = arith.addf %[[VAL_25]], %[[VAL_27]] fastmath<contract> : f32
+! CHECK:               fir.store %[[VAL_28]] to %[[VAL_15]] : !fir.ref<f32>
+! CHECK:               %[[VAL_29:.*]] = fir.load %[[VAL_16]] : !fir.ref<f64>
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CHECK:               %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (i32) -> f64
+! CHECK:               %[[VAL_32:.*]] = arith.addf %[[VAL_29]], %[[VAL_31]] fastmath<contract> : f64
+! CHECK:               fir.store %[[VAL_32]] to %[[VAL_16]] : !fir.ref<f64>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+
+subroutine multiple_reductions_different_type
+  integer :: x
+  integer(kind=8) :: y
+  real :: z
+  real(kind=8) :: w
+  x = 0
+  y = 0
+  z = 0.0
+  w = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z,w)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+    w = w + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90
new file mode 100644
index 00000000000000..00dfad69248bc5
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90
@@ -0,0 +1,46 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+!CHECK-LABEL:   omp.reduction.declare @iand_i_32_byref : !fir.ref<i32>
+!CHECK-SAME:    init {
+!CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+!CHECK:            %[[C0_1:.*]] = arith.constant -1 : i32
+!CHECK:            %[[REF:.*]] = fir.alloca i32
+!CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+!CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
+
+!CHECK-LABEL:   } combiner {
+!CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+!CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+!CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+!CHECK:           %[[RES:.*]] = arith.andi %[[LD0]], %[[LD1]] : i32
+!CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+!CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+!CHECK:         }
+
+
+!CHECK-LABEL: @_QPreduction_iand
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
+!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
+!CHECK: omp.parallel
+!CHECK: omp.wsloop byref reduction(@iand_i_32_byref %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>) for
+!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
+!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
+!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK: %[[RES:.+]] = arith.andi %[[LPRV]], %[[Y_I]] : i32
+!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
+!CHECK: omp.yield
+!CHECK: omp.terminator
+
+subroutine reduction_iand(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(iand:x)
+  do i=1, 100
+    x = iand(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90
new file mode 100644
index 00000000000000..fb35c405ca1d54
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90
@@ -0,0 +1,45 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -mmlir --force-byref-reduction -fopenmp %s -o - | FileCheck %s
+
+! CHECK-LABEL:   omp.reduction.declare @ieor_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.xori %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+!CHECK-LABEL: @_QPreduction_ieor
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
+!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"}
+!CHECK: omp.parallel
+!CHECK: omp.wsloop byref reduction(@ieor_i_32_byref %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>) for
+!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
+!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
+!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK: %[[RES:.+]] = arith.xori %[[LPRV]], %[[Y_I]] : i32
+!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
+!CHECK: omp.yield
+!CHECK: omp.terminator
+
+subroutine reduction_ieor(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(ieor:x)
+  do i=1, 100
+    x = ieor(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90
new file mode 100644
index 00000000000000..0365eff3fd8136
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90
@@ -0,0 +1,45 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! CHECK-LABEL:   omp.reduction.declare @ior_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.ori %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+!CHECK-LABEL: @_QPreduction_ior
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>> 
+!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"}
+!CHECK: omp.parallel
+!CHECK: omp.wsloop byref reduction(@ior_i_32_byref %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>) for 
+!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
+!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
+!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK: %[[RES:.+]] = arith.ori %[[LPRV]], %[[Y_I]] : i32
+!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
+!CHECK: omp.yield
+!CHECK: omp.terminator
+
+subroutine reduction_ior(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(ior:x)
+  do i=1, 100
+    x = ior(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90
new file mode 100644
index 00000000000000..84219b34f964f4
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90
@@ -0,0 +1,187 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @eqv_reduction : !fir.ref<!fir.logical<4>>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant true
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
+! CHECK:           %[[REF:.*]] = fir.alloca !fir.logical<4>
+! CHECK:           fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[RES:.*]] = arith.cmpi eq, %[[VAL_2]], %[[VAL_3]] : i1
+! CHECK:           %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction(
+! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+! CHECK:           %[[VAL_3:.*]] = arith.constant true
+! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
+! CHECK:               fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_12:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> i64
+! CHECK:               %[[VAL_14:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_15:.*]] = arith.subi %[[VAL_13]], %[[VAL_14]] : i64
+! CHECK:               %[[VAL_16:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_15]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_18]], %[[VAL_19]] : i1
+! CHECK:               %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x)
+  do i=1, 100
+    x = x .eqv. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
+! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_3:.*]] = arith.constant true
+! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
+! CHECK:               fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64
+! CHECK:               %[[VAL_13:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_14:.*]] = arith.subi %[[VAL_12]], %[[VAL_13]] : i64
+! CHECK:               %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_14]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_17:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_18]], %[[VAL_19]] : i1
+! CHECK:               %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x)
+  do i=1, 100
+  x = y(i) .eqv. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions(
+! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+! CHECK:           %[[VAL_4:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+! CHECK:           %[[VAL_5:.*]] = arith.constant true
+! CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_7:.*]] = arith.constant true
+! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_9:.*]] = arith.constant true
+! CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_10]] to %[[VAL_4]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_3]] -> %[[VAL_16:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_4]] -> %[[VAL_17:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_18:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_20:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i64
+! CHECK:               %[[VAL_22:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_23:.*]] = arith.subi %[[VAL_21]], %[[VAL_22]] : i64
+! CHECK:               %[[VAL_24:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_23]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_19]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_28:.*]] = arith.cmpi eq, %[[VAL_26]], %[[VAL_27]] : i1
+! CHECK:               %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_29]] to %[[VAL_15]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
+! CHECK:               %[[VAL_33:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64
+! CHECK:               %[[VAL_35:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_34]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_38:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_39:.*]] = arith.cmpi eq, %[[VAL_37]], %[[VAL_38]] : i1
+! CHECK:               %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_40]] to %[[VAL_16]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_41:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_42:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (i32) -> i64
+! CHECK:               %[[VAL_44:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_45:.*]] = arith.subi %[[VAL_43]], %[[VAL_44]] : i64
+! CHECK:               %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_45]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_48:.*]] = fir.convert %[[VAL_41]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_50:.*]] = arith.cmpi eq, %[[VAL_48]], %[[VAL_49]] : i1
+! CHECK:               %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_51]] to %[[VAL_17]] : !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x,y,z)
+  do i=1, 100
+  x = x .eqv. w(i)
+  y = y .eqv. w(i)
+  z = z .eqv. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90
new file mode 100644
index 00000000000000..ec4d8500850b48
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90
@@ -0,0 +1,189 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+
+! CHECK-LABEL:   omp.reduction.declare @neqv_reduction : !fir.ref<!fir.logical<4>>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant false
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
+! CHECK:           %[[REF:.*]] = fir.alloca !fir.logical<4>
+! CHECK:           fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[RES:.*]] = arith.cmpi ne, %[[VAL_2]], %[[VAL_3]] : i1
+! CHECK:           %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction(
+! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+! CHECK:           %[[VAL_3:.*]] = arith.constant true
+! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
+! CHECK:               fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_12:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> i64
+! CHECK:               %[[VAL_14:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_15:.*]] = arith.subi %[[VAL_13]], %[[VAL_14]] : i64
+! CHECK:               %[[VAL_16:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_15]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i1
+! CHECK:               %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x)
+  do i=1, 100
+    x = x .neqv. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
+! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_3:.*]] = arith.constant true
+! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
+! CHECK:               fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64
+! CHECK:               %[[VAL_13:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_14:.*]] = arith.subi %[[VAL_12]], %[[VAL_13]] : i64
+! CHECK:               %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_14]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_17:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i1
+! CHECK:               %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x)
+  do i=1, 100
+  x = y(i) .neqv. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions(
+! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+! CHECK:           %[[VAL_4:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+! CHECK:           %[[VAL_5:.*]] = arith.constant true
+! CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_7:.*]] = arith.constant true
+! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_9:.*]] = arith.constant true
+! CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_10]] to %[[VAL_4]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_3]] -> %[[VAL_16:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_4]] -> %[[VAL_17:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_18:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_20:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i64
+! CHECK:               %[[VAL_22:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_23:.*]] = arith.subi %[[VAL_21]], %[[VAL_22]] : i64
+! CHECK:               %[[VAL_24:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_23]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_19]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_28:.*]] = arith.cmpi ne, %[[VAL_26]], %[[VAL_27]] : i1
+! CHECK:               %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_29]] to %[[VAL_15]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
+! CHECK:               %[[VAL_33:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64
+! CHECK:               %[[VAL_35:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_34]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_38:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_39:.*]] = arith.cmpi ne, %[[VAL_37]], %[[VAL_38]] : i1
+! CHECK:               %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_40]] to %[[VAL_16]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_41:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_42:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (i32) -> i64
+! CHECK:               %[[VAL_44:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_45:.*]] = arith.subi %[[VAL_43]], %[[VAL_44]] : i64
+! CHECK:               %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_45]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_48:.*]] = fir.convert %[[VAL_41]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_50:.*]] = arith.cmpi ne, %[[VAL_48]], %[[VAL_49]] : i1
+! CHECK:               %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_51]] to %[[VAL_17]] : !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x,y,z)
+  do i=1, 100
+  x = x .neqv. w(i)
+  y = y .neqv. w(i)
+  z = z .neqv. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90
new file mode 100644
index 00000000000000..ddda24a17a8311
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90
@@ -0,0 +1,90 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+!CHECK: omp.reduction.declare @max_f_32_byref : !fir.ref<f32>
+!CHECK-SAME: init {
+!CHECK:   %[[MINIMUM_VAL:.*]] = arith.constant -3.40282347E+38 : f32
+!CHECK:   %[[REF:.*]] = fir.alloca f32
+!CHECK:   fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<f32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+!CHECK:   %[[RES:.*]] = arith.maximumf %[[LD0]], %[[LD1]] {{.*}}: f32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<f32>)
+
+!CHECK-LABEL: omp.reduction.declare @max_i_32_byref : !fir.ref<i32>
+!CHECK-SAME: init {
+!CHECK:   %[[MINIMUM_VAL:.*]] = arith.constant -2147483648 : i32
+!CHECK:   fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<i32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+!CHECK:   %[[RES:.*]] = arith.maxsi %[[LD0]], %[[LD1]] : i32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<i32>)
+
+!CHECK-LABEL: @_QPreduction_max_int
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
+!CHECK:   %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
+!CHECK:   omp.parallel
+!CHECK:     omp.wsloop byref reduction(@max_i_32_byref %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>) for
+!CHECK:       %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
+!CHECK:       %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
+!CHECK:       %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK:       %[[RES:.+]] = arith.cmpi sgt, %[[LPRV]], %[[Y_I]] : i32
+!CHECK:       %[[SEL:.+]] = arith.select %[[RES]], %[[LPRV]], %[[Y_I]]
+!CHECK:       fir.store %[[SEL]] to %[[PRV]] : !fir.ref<i32>
+!CHECK:     omp.terminator
+
+!CHECK-LABEL: @_QPreduction_max_real
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xf32>>
+!CHECK:   %[[X_REF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"}
+!CHECK:   omp.parallel
+!CHECK:     omp.wsloop byref reduction(@max_f_32_byref %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<f32>) for
+!CHECK:       %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
+!CHECK:       %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
+!CHECK:       %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<f32>
+!CHECK:       %[[RES:.+]] = arith.cmpf ogt, %[[Y_I]], %[[LPRV]] {{.*}} : f32
+!CHECK:       omp.yield
+!CHECK:     omp.terminator
+
+subroutine reduction_max_int(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    x = max(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
+
+subroutine reduction_max_real(y)
+  real :: x, y(:)
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    x = max(y(i), x)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    !CHECK-NOT: omp.reduction
+    if (y(i) .gt. x) x = y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90
new file mode 100644
index 00000000000000..d767c54cdbc5f8
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90
@@ -0,0 +1,91 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -mmlir --force-byref-reduction -fopenmp -o - %s 2>&1 | FileCheck %s
+
+!CHECK: omp.reduction.declare @min_f_32_byref : !fir.ref<f32>
+!CHECK-SAME: init {
+!CHECK:   %[[MAXIMUM_VAL:.*]] = arith.constant 3.40282347E+38 : f32
+!CHECK:   %[[REF:.*]] = fir.alloca f32
+!CHECK:   fir.store %[[MAXIMUM_VAL]] to %[[REF]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<f32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+!CHECK:   %[[RES:.*]] = arith.minimumf %[[LD0]], %[[LD1]] {{.*}}: f32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<f32>)
+
+!CHECK-LABEL: omp.reduction.declare @min_i_32_byref : !fir.ref<i32>
+!CHECK-SAME: init {
+!CHECK:   %[[MAXIMUM_VAL:.*]] = arith.constant 2147483647 : i32
+!CHECK:   fir.store %[[MAXIMUM_VAL]] to %[[REF]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<i32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+!CHECK:   %[[RES:.*]] = arith.minsi %[[LD0]], %[[LD1]] : i32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<i32>)
+
+!CHECK-LABEL: @_QPreduction_min_int
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
+!CHECK:   %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"}
+!CHECK:   omp.parallel
+!CHECK:     omp.wsloop byref reduction(@min_i_32_byref %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>) for
+!CHECK:       %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
+!CHECK:       %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
+!CHECK:       %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK:       %[[RES:.+]] = arith.cmpi slt, %[[LPRV]], %[[Y_I]] : i32
+!CHECK:       %[[SEL:.+]] = arith.select %[[RES]], %[[LPRV]], %[[Y_I]]
+!CHECK:       fir.store %[[SEL]] to %[[PRV]] : !fir.ref<i32>
+!CHECK:       omp.yield
+!CHECK:     omp.terminator
+
+!CHECK-LABEL: @_QPreduction_min_real
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xf32>>
+!CHECK:   %[[X_REF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"}
+!CHECK:   omp.parallel
+!CHECK:     omp.wsloop byref reduction(@min_f_32_byref %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<f32>) for
+!CHECK:       %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
+!CHECK:       %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
+!CHECK:       %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<f32>
+!CHECK:       %[[RES:.+]] = arith.cmpf ogt, %[[Y_I]], %[[LPRV]] {{.*}} : f32
+!CHECK:       omp.yield
+!CHECK:     omp.terminator
+
+subroutine reduction_min_int(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+    x = min(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
+
+subroutine reduction_min_real(y)
+  real :: x, y(:)
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+    x = min(y(i), x)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+  !CHECK-NOT: omp.reduction
+    if (y(i) .gt. x) x = y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/default-clause-byref.f90 b/flang/test/Lower/OpenMP/default-clause-byref.f90
new file mode 100644
index 00000000000000..5d9538e53069d6
--- /dev/null
+++ b/flang/test/Lower/OpenMP/default-clause-byref.f90
@@ -0,0 +1,385 @@
+! This test checks lowering of OpenMP parallel directive
+! with `DEFAULT` clause present.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+! RUN: bbc -fopenmp -emit-hlfir --force-byref-reduction %s -o - | FileCheck %s
+
+
+!CHECK: func @_QQmain() attributes {fir.bindc_name = "default_clause_lowering"} {
+!CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFEw"}
+!CHECK: %[[W_DECL:.*]]:2 = hlfir.declare %[[W]] {uniq_name = "_QFEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFEz"}
+!CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[CONST:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[CONST]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFEw"}
+!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[CONST:.*]] = arith.constant 2 : i32
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[RESULT:.*]] = arith.muli %[[CONST]], %[[TEMP]] : i32
+!CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_W_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 45 : i32
+!CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32
+!CHECK: hlfir.assign %[[RESULT]] to %[[Z_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+
+program default_clause_lowering
+    integer :: x, y, z, w
+
+    !$omp parallel default(private) firstprivate(x) shared(z)
+        x = y * 2
+        z = w + 45
+    !$omp end parallel
+
+!CHECK: omp.parallel {
+!CHECK: %[[TEMP:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+
+    !$omp parallel default(shared)
+        x = y
+    !$omp end parallel
+
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+
+    !$omp parallel default(none) private(x, y)
+        x = y
+    !$omp end parallel
+
+!CHECK: omp.parallel   {
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+
+    !$omp parallel default(firstprivate) firstprivate(y)
+        x = y
+    !$omp end parallel
+
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFEw"}
+!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[W_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_W_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 2 : i32
+!CHECK: %[[RESULT:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[TEMP:.*]] = arith.muli %[[CONST]], %[[RESULT]] : i32
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_W_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 45 : i32
+!CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32
+!CHECK: hlfir.assign %[[RESULT]] to %[[Z_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+
+    !$omp parallel default(firstprivate) private(x) shared(z)
+        x = y * 2
+        z = w + 45
+    !$omp end parallel
+
+!CHECK: omp.parallel   {
+!CHECK: omp.parallel   {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFEw"}
+!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[W_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_W_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_W_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.terminator
+!CHECK: }
+    !$omp parallel
+        !$omp parallel default(private)
+            x = y
+        !$omp end parallel
+
+        !$omp parallel default(firstprivate)
+            w = x
+        !$omp end parallel
+    !$omp end parallel
+
+end program default_clause_lowering
+
+subroutine nested_default_clause_tests
+    integer :: x, y, z, w, k, a
+!CHECK: %[[K:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFnested_default_clause_testsEk"}
+!CHECK: %[[K_DECL:.*]]:2 = hlfir.declare %[[K]] {uniq_name = "_QFnested_default_clause_testsEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFnested_default_clause_testsEw"}
+!CHECK: %[[W_DECL:.*]]:2 = hlfir.declare %[[W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFnested_default_clause_testsEz"}
+!CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.parallel   {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
+!CHECK: %[[PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned, uniq_name = "_QFnested_default_clause_testsEk"}
+!CHECK: %[[PRIVATE_K_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_K]] {uniq_name = "_QFnested_default_clause_testsEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.parallel {
+!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[INNER_PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[INNER_PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[CONST:.*]] = arith.constant 20 : i32
+!CHECK: hlfir.assign %[[CONST]] to %[[INNER_PRIVATE_Y_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 10 : i32
+!CHECK: hlfir.assign %[[CONST]] to %[[INNER_PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.parallel   {
+!CHECK: %[[INNER_PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
+!CHECK: %[[INNER_PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[INNER_PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
+!CHECK: %[[INNER_PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Z_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_Z_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[INNER_PRIVATE_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned, uniq_name = "_QFnested_default_clause_testsEk"}
+!CHECK: %[[INNER_PRIVATE_K_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_K]] {uniq_name = "_QFnested_default_clause_testsEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_K_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_K_DECL]]#0 temporary_lhs : i32, !fir.ref<i32> 
+!CHECK: %[[CONST:.*]] = arith.constant 30 : i32
+!CHECK: hlfir.assign %[[CONST]] to %[[PRIVATE_Y_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 40 : i32
+!CHECK: hlfir.assign %[[CONST]] to %[[INNER_PRIVATE_W_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 50 : i32
+!CHECK: hlfir.assign %[[CONST]] to %[[INNER_PRIVATE_Z_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 40 : i32
+!CHECK: hlfir.assign %[[CONST]] to %[[INNER_PRIVATE_K_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.terminator
+!CHECK: }
+    !$omp parallel  firstprivate(x) private(y) shared(w) default(private)  
+        !$omp parallel default(private)
+           y = 20
+           x = 10 
+        !$omp end parallel 
+
+        !$omp parallel default(firstprivate) shared(y) private(w) 
+            y = 30
+            w = 40 
+            z = 50
+            k = 40
+        !$omp end parallel
+    !$omp end parallel
+    
+    
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
+!CHECK: %[[PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
+!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_INNER_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[PRIVATE_INNER_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_INNER_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_INNER_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[INNER_PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[TEMP:.*]] = fir.load %[[INNER_PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_INNER_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_INNER_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
+!CHECK: %[[PRIVATE_INNER_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_INNER_W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_INNER_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[PRIVATE_INNER_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_INNER_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP_1:.*]] = fir.load %[[PRIVATE_INNER_X_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[TEMP_2:.*]] = fir.load %[[PRIVATE_Z_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[RESULT:.*]] = arith.addi %{{.*}}, %{{.*}} : i32
+!CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_INNER_W_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+    !$omp parallel default(private)
+        !$omp parallel default(firstprivate)
+            x = y
+        !$omp end parallel
+
+        !$omp parallel default(private) shared(z)
+            w = x + z
+        !$omp end parallel
+    !$omp end parallel    
+    
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
+!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
+!CHECK: %[[PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.parallel {
+!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[INNER_PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[INNER_PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[TEMP:.*]] = fir.load %[[INNER_PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.parallel {
+!CHECK: %[[TEMP_1:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[TEMP_2:.*]] = fir.load %[[PRIVATE_Z_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[TEMP_3:.*]] = arith.addi %[[TEMP_1]], %[[TEMP_2]] : i32
+!CHECK: hlfir.assign %[[TEMP_3]] to %[[PRIVATE_W_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: }
+    !$omp parallel default(private)
+		!$omp parallel default(firstprivate)
+			x = y
+		!$omp end parallel
+
+		!$omp parallel default(shared)
+			w = x + z
+		!$omp end parallel
+	!$omp end parallel
+
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: omp.single {
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: return
+!CHECK: } 
+	!$omp parallel default(firstprivate)
+		!$omp single
+			x = y
+		!$omp end single
+	!$omp end parallel
+end subroutine
+
+!CHECK: func.func @_QPskipped_default_clause_checks() {
+!CHECK: %[[TYPE_ADDR:.*]] = fir.address_of(@_QFskipped_default_clause_checksE.n.i1) : !fir.ref<!fir.char<1,2>>
+!CHECK: %[[VAL_CONST_2:.*]] = arith.constant 2 : index
+!CHECK: %[[VAL_I1_DECLARE:.*]]:2 = hlfir.declare %[[TYPE_ADDR]] typeparams %[[VAL_CONST_2]] {{.*}}
+!CHECK: %[[TYPE_ADDR_IT:.*]] = fir.address_of(@_QFskipped_default_clause_checksE.n.it) : !fir.ref<!fir.char<1,2>>
+!CHECK: %[[VAL_CONST_2_0:.*]] = arith.constant 2 : index
+!CHECK: %[[VAL_IT_DECLARE:.*]]:2 = hlfir.declare %[[TYPE_ADDR_IT]] typeparams %[[VAL_CONST_2_0]] {{.*}}
+!CHECK: %[[VAL_I_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFskipped_default_clause_checksEi"}
+!CHECK: %[[VAL_I_DECLARE:.*]]:2 = hlfir.declare %[[VAL_I_ALLOCA]] {{.*}}
+!CHECK: %[[VAL_III_ALLOCA:.*]] = fir.alloca !fir.type<_QFskipped_default_clause_checksTit{i1:i32}> {bindc_name = "iii", uniq_name = "_QFskipped_default_clause_checksEiii"}
+!CHECK: %[[VAL_III_DECLARE:.*]]:2 = hlfir.declare %[[VAL_III_ALLOCA]] {{.*}}
+!CHECK: %[[VAL_X_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFskipped_default_clause_checksEx"}
+!CHECK: %[[VAL_X_DECLARE:.*]]:2 = hlfir.declare %[[VAL_X_ALLOCA]] {{.*}}
+!CHECK: %[[VAL_Y_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFskipped_default_clause_checksEy"}
+!CHECK: %[[VAL_Y_DECLARE:.*]]:2 = hlfir.declare %[[VAL_Y_ALLOCA]] {{.*}}
+!CHECK: %[[VAL_Z_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFskipped_default_clause_checksEz"}
+!CHECK: %[[VAL_Z_DECLARE:.*]]:2 = hlfir.declare %[[VAL_Z_ALLOCA]] {{.*}}
+subroutine skipped_default_clause_checks()
+       integer :: x,y,z
+       type it
+         integer::i1
+       end type
+       type(it)::iii
+
+!CHECK: omp.parallel {
+!CHECK: omp.wsloop byref reduction(@min_i_32_byref %[[VAL_Z_DECLARE]]#0 -> %[[PRV:.+]] : !fir.ref<i32>) for (%[[ARG:.*]]) {{.*}} {
+!CHECK: omp.yield
+!CHECK: }
+!CHECK: omp.terminator
+!CHECK: }
+       !$omp parallel do default(private) REDUCTION(MIN:z)
+         do i = 1, 10
+           x = x + MIN(y,x)
+         enddo
+       !$omp end parallel do
+
+!CHECK: omp.parallel {
+!CHECK: omp.terminator
+!CHECK: }
+       namelist /nam/i
+       !$omp parallel default(private)
+          write(1,nam )
+       !$omp endparallel
+
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_III_ALLOCA:.*]] = fir.alloca !fir.type<_QFskipped_default_clause_checksTit{i1:i32}> {{.*}}
+!CHECK: %[[PRIVATE_III_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_III_ALLOCA]] {{.*}}
+!CHECK: %[[PRIVATE_ADDR:.*]] = fir.address_of(@_QQro._QFskipped_default_clause_checksTit.0) : !fir.ref<!fir.type<_QFskipped_default_clause_checksTit{i1:i32}>>
+!CHECK: %[[PRIVATE_PARAM:.*]]:2 = hlfir.declare %[[PRIVATE_ADDR]] {{.*}}
+!CHECK: hlfir.assign %[[PRIVATE_PARAM]]#0 to %[[PRIVATE_III_DECLARE]]#0 {{.*}}
+!CHECK: omp.terminator
+!CHECK: }
+       !$omp parallel default(private)
+          iii=it(11)
+       !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
new file mode 100644
index 00000000000000..067a71340b8dda
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
@@ -0,0 +1,30 @@
+! Test that reductions and delayed privatization work properly togehter. Since
+! both types of clauses add block arguments to the OpenMP region, we make sure
+! that the block arguments are added in the proper order (reductions first and
+! then delayed privatization.
+
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine red_and_delayed_private
+    integer :: red
+    integer :: prv
+
+    red = 0
+    prv = 10
+
+    !$omp parallel reduction(+:red) private(prv)
+    red = red + 1
+    prv = 20
+    !$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+
+! CHECK-LABEL: omp.reduction.declare
+! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref<i32> init
+
+! CHECK-LABEL: _QPred_and_delayed_private
+! CHECK: omp.parallel
+! CHECK-SAME: reduction(@[[REDUCTION_SYM]] %{{.*}} -> %arg0 : !fir.ref<i32>)
+! CHECK-SAME: private(@[[PRIVATIZER_SYM]] %{{.*}} -> %arg1 : !fir.ref<i32>) {
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/parallel-reduction-add-byref.f90
new file mode 100644
index 00000000000000..c4a4695b8d9fc5
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-reduction-add-byref.f90
@@ -0,0 +1,125 @@
+! RUN: bbc -emit-hlfir --force-byref-reduction -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_F32_NAME:.*]] : !fir.ref<f32>
+!CHECK-SAME: init {
+!CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
+!CHECK:  %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  %[[REF:.*]] = fir.alloca f32
+!CHECKL  fir.store [[%C0_1]] to %[[REF]] : !fir.ref<f32>
+!CHECK:  omp.yield(%[[REF]] : !fir.ref<f32>)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+!CHECK:  %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+!CHECK:  %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+!CHECK:  %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] {{.*}}: f32
+!CHECK:  fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+!CHECK:  omp.yield(%[[ARG0]] : !fir.ref<f32>)
+!CHECK: }
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_I32_NAME:.*]] : !fir.ref<i32>
+!CHECK-SAME: init {
+!CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
+!CHECK:  %[[C0_1:.*]] = arith.constant 0 : i32
+!CHECK:  %[[REF:.*]] = fir.alloca i32
+!CHECK:  fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+!CHECK:  omp.yield(%[[REF]] : !fir.ref<i32>)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+!CHECK:  %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+!CHECK:  %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+!CHECK:  %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32
+!CHECK:  fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+!CHECK:  omp.yield(%[[ARG0]] : !fir.ref<i32>)
+!CHECK: }
+
+!CHECK-LABEL: func.func @_QPsimple_int_add
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_addEi"}
+!CHECK:  %[[I_DECL:.*]]:2 = hlfir.declare %[[IREF]] {uniq_name = "_QFsimple_int_addEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[I_START:.*]] = arith.constant 0 : i32
+!CHECK:  hlfir.assign %[[I_START]] to %[[I_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:  omp.parallel byref reduction(@[[RED_I32_NAME]] %[[I_DECL]]#0 -> %[[PRV:.+]] : !fir.ref<i32>) {
+!CHECK:    %[[P_DECL:.+]]:2 = hlfir.declare %[[PRV]] {{.*}} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[LPRV:.+]] = fir.load %[[P_DECL]]#0 : !fir.ref<i32>
+!CHECK:    %[[I_INCR:.*]] = arith.constant 1 : i32
+!CHECK:    %[[RES:.+]] = arith.addi %[[LPRV]], %[[I_INCR]] : i32
+!CHECK:    hlfir.assign %[[RES]] to %[[P_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine simple_int_add
+    integer :: i
+    i = 0
+
+    !$omp parallel reduction(+:i)
+    i = i + 1
+    !$omp end parallel
+
+    print *, i
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_real_add
+!CHECK:  %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFsimple_real_addEr"}
+!CHECK:  %[[R_DECL:.*]]:2 = hlfir.declare %[[RREF]] {uniq_name = "_QFsimple_real_addEr"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  hlfir.assign %[[R_START]] to %[[R_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:  omp.parallel byref reduction(@[[RED_F32_NAME]] %[[R_DECL]]#0 -> %[[PRV:.+]] : !fir.ref<f32>) {
+!CHECK:    %[[P_DECL:.+]]:2 = hlfir.declare %[[PRV]] {{.*}} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:    %[[LPRV:.+]] = fir.load %[[P_DECL]]#0 : !fir.ref<f32>
+!CHECK:    %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
+!CHECK:    %[[RES:.+]] = arith.addf %[[LPRV]], %[[R_INCR]] {{.*}} : f32
+!CHECK:    hlfir.assign %[[RES]] to %[[P_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine simple_real_add
+    real :: r
+    r = 0.0
+
+    !$omp parallel reduction(+:r)
+    r = r + 1.5
+    !$omp end parallel
+
+    print *, r
+end subroutine
+
+!CHECK-LABEL: func.func @_QPint_real_add
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFint_real_addEi"}
+!CHECK:  %[[I_DECL:.*]]:2 = hlfir.declare %[[IREF]] {uniq_name = "_QFint_real_addEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFint_real_addEr"}
+!CHECK:  %[[R_DECL:.*]]:2 = hlfir.declare %[[RREF]] {uniq_name = "_QFint_real_addEr"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  hlfir.assign %[[R_START]] to %[[R_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:  %[[I_START:.*]] = arith.constant 0 : i32
+!CHECK:  hlfir.assign %[[I_START]] to %[[I_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:  omp.parallel byref reduction(@[[RED_I32_NAME]] %[[I_DECL]]#0 -> %[[IPRV:.+]] : !fir.ref<i32>, @[[RED_F32_NAME]] %[[R_DECL]]#0 -> %[[RPRV:.+]] : !fir.ref<f32>) {
+!CHECK:    %[[IP_DECL:.+]]:2 = hlfir.declare %[[IPRV]] {{.*}} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[RP_DECL:.+]]:2 = hlfir.declare %[[RPRV]] {{.*}} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:    %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
+!CHECK:    %[[R_LPRV:.+]] = fir.load %[[RP_DECL]]#0 : !fir.ref<f32>
+!CHECK:    %[[RES1:.+]] = arith.addf %[[R_INCR]], %[[R_LPRV]] {{.*}} : f32
+!CHECK:    hlfir.assign %[[RES1]] to %[[RP_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:    %[[I_LPRV:.+]] = fir.load %[[IP_DECL]]#0 : !fir.ref<i32>
+!CHECK:    %[[I_INCR:.*]] = arith.constant 3 : i32
+!CHECK:    %[[RES0:.+]] = arith.addi %[[I_LPRV]], %[[I_INCR]] : i32
+!CHECK:    hlfir.assign %[[RES0]] to %[[IP_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine int_real_add
+    real :: r
+    integer :: i
+
+    r = 0.0
+    i = 0
+
+    !$omp parallel reduction(+:i,r)
+    r = 1.5 + r
+    i = i + 3
+    !$omp end parallel
+
+    print *, r
+    print *, i
+end subroutine
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 b/flang/test/Lower/OpenMP/parallel-reduction-byref.f90
new file mode 100644
index 00000000000000..a7c77c52674ee9
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-reduction-byref.f90
@@ -0,0 +1,44 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+!CHECK:  omp.reduction.declare @[[REDUCTION_DECLARE:[_a-z0-9]+]] : !fir.ref<i32>
+!CHECK-SAME: init {
+!CHECK:  ^bb0(%{{.*}}: !fir.ref<i32>):
+!CHECK:    %[[I0:[_a-z0-9]+]] = arith.constant 0 : i32
+!CHECK:  %[[REF:.*]] = fir.alloca i32
+!CHECKL  fir.store [[%I0]] to %[[REF]] : !fir.ref<i32>
+!CHECK:    omp.yield(%[[REF]] : !fir.ref<i32>)
+!CHECK:  } combiner {
+!CHECK:  ^bb0(%[[C0:[_a-z0-9]+]]: !fir.ref<i32>, %[[C1:[_a-z0-9]+]]: !fir.ref<i32>):
+!CHECK:    %[[LD0:.*]] = fir.load %[[C0]] : !fir.ref<i32>
+!CHECK:    %[[LD1:.*]] = fir.load %[[C1]] : !fir.ref<i32>
+!CHECK:    %[[CR:[_a-z0-9]+]] = arith.addi %[[LD0]], %[[LD1]] : i32
+!CHECK:    fir.store %[[CR]] to %[[C0]] : !fir.ref<i32>
+!CHECK:    omp.yield(%[[C0]] : !fir.ref<i32>)
+!CHECK:  }
+!CHECK:  func.func @_QQmain() attributes {fir.bindc_name = "mn"} {
+!CHECK:    %[[RED_ACCUM_REF:[_a-z0-9]+]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+!CHECK:    %[[RED_ACCUM_DECL:[_a-z0-9]+]]:2 = hlfir.declare %[[RED_ACCUM_REF]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C0:[_a-z0-9]+]] = arith.constant 0 : i32
+!CHECK:    hlfir.assign %[[C0]] to %[[RED_ACCUM_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:    omp.parallel byref reduction(@[[REDUCTION_DECLARE]] %[[RED_ACCUM_DECL]]#0 -> %[[PRIVATE_RED:[a-z0-9]+]] : !fir.ref<i32>) {
+!CHECK:      %[[PRIVATE_DECL:[_a-z0-9]+]]:2 = hlfir.declare %[[PRIVATE_RED]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:      %[[C1:[_a-z0-9]+]] = arith.constant 1 : i32
+!CHECK:      hlfir.assign %[[C1]] to %[[PRIVATE_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      omp.terminator
+!CHECK:    }
+!CHECK:    %[[RED_ACCUM_VAL:[_a-z0-9]+]] = fir.load %[[RED_ACCUM_DECL]]#0 : !fir.ref<i32>
+!CHECK:    {{.*}} = fir.call @_FortranAioOutputInteger32(%{{.*}}, %[[RED_ACCUM_VAL]]) fastmath<contract> : (!fir.ref<i8>, i32) -> i1
+!CHECK:    return
+!CHECK:  }
+
+program mn
+    integer :: i
+    i = 0
+
+    !$omp parallel reduction(+:i)
+      i = 1
+    !$omp end parallel
+
+    print *, i
+end program
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-reduction-byref.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-reduction-byref.f90
new file mode 100644
index 00000000000000..8492a69fed5825
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-wsloop-reduction-byref.f90
@@ -0,0 +1,16 @@
+! Check that for parallel do, reduction is only processed for the loop
+
+! RUN: bbc -fopenmp --force-byref-reduction -emit-hlfir %s -o - | FileCheck %s
+! RUN: flang-new -fc1 -fopenmp -mmlir --force-byref-reduction -emit-hlfir %s -o - | FileCheck %s
+
+! CHECK: omp.parallel {
+! CHECK: omp.wsloop byref reduction(@add_reduction_i_32
+subroutine sb
+  integer :: x
+  x = 0
+  !$omp parallel do reduction(+:x)
+  do i=1,100
+    x = x + 1
+  end do
+  !$omp end parallel do
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90
new file mode 100644
index 00000000000000..e8a04c23f4ea04
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90
@@ -0,0 +1,433 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_f_64_byref : !fir.ref<f64>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<f64>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0.000000e+00 : f64
+! CHECK:            %[[REF:.*]] = fir.alloca f64
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<f64>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<f64>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<f64>, %[[ARG1:.*]]: !fir.ref<f64>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f64>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f64>
+! CHECK:           %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] fastmath<contract> : f64
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f64>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<f64>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_i_64_byref : !fir.ref<i64>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i64>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i64
+! CHECK:            %[[REF:.*]] = fir.alloca i64
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i64>
+! CHECK:            omp.yield(%[[REF]] : !fir.ref<i64>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i64>, %[[ARG1:.*]]: !fir.ref<i64>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i64>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i64>
+! CHECK:           %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i64
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i64>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i64>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_f_32_byref : !fir.ref<f32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:            %[[REF:.*]] = fir.alloca f32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<f32>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<f32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+! CHECK:           %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] fastmath<contract> : f32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<f32>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<i32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = arith.addi %[[VAL_13]], %[[VAL_14]] : i32
+! CHECK:               hlfir.assign %[[VAL_15]] to %[[VAL_12]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_int_reduction
+  integer :: x
+  x = 0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = x + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+
+! CHECK-LABEL:   func.func @_QPsimple_real_reduction() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reductionEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_real_reductionEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_f_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<f32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reductionEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_14:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i32) -> f32
+! CHECK:               %[[VAL_16:.*]] = arith.addf %[[VAL_13]], %[[VAL_15]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_16]] to %[[VAL_12]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_real_reduction
+  real :: x
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = x + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction_switch_order() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_int_reduction_switch_orderEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<i32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reduction_switch_orderEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = arith.addi %[[VAL_13]], %[[VAL_14]] : i32
+! CHECK:               hlfir.assign %[[VAL_15]] to %[[VAL_12]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_int_reduction_switch_order
+  integer :: x
+  x = 0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = i + x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_real_reduction_switch_order() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_real_reduction_switch_orderEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_f_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<f32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reduction_switch_orderEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i32) -> f32
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_16:.*]] = arith.addf %[[VAL_14]], %[[VAL_15]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_16]] to %[[VAL_12]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_real_reduction_switch_order
+  real :: x
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = i + x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_int_reductions_same_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_int_reductions_same_typeEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_int_reductions_same_typeEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFmultiple_int_reductions_same_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFmultiple_int_reductions_same_typeEy"}
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmultiple_int_reductions_same_typeEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_int_reductions_same_typeEz"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_int_reductions_same_typeEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_8]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:           %[[VAL_9:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_5]]#0 : i32, !fir.ref<i32>
+! CHECK:           %[[VAL_10:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_13:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_15:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_3]]#0 -> %[[VAL_16:.*]] : !fir.ref<i32>, @add_reduction_i_32_byref %[[VAL_5]]#0 -> %[[VAL_17:.*]] : !fir.ref<i32>, @add_reduction_i_32_byref %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref<i32>)  for  (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) {
+! CHECK:               fir.store %[[VAL_19]] to %[[VAL_12]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_int_reductions_same_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_int_reductions_same_typeEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_int_reductions_same_typeEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_20]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_24:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_25:.*]] = arith.addi %[[VAL_23]], %[[VAL_24]] : i32
+! CHECK:               hlfir.assign %[[VAL_25]] to %[[VAL_20]]#0 : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_26:.*]] = fir.load %[[VAL_21]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_27:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_28:.*]] = arith.addi %[[VAL_26]], %[[VAL_27]] : i32
+! CHECK:               hlfir.assign %[[VAL_28]] to %[[VAL_21]]#0 : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_29:.*]] = fir.load %[[VAL_22]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_31:.*]] = arith.addi %[[VAL_29]], %[[VAL_30]] : i32
+! CHECK:               hlfir.assign %[[VAL_31]] to %[[VAL_22]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine multiple_int_reductions_same_type
+  integer :: x,y,z
+  x = 0
+  y = 0
+  z = 0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_real_reductions_same_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_real_reductions_same_typeEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFmultiple_real_reductions_same_typeEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFmultiple_real_reductions_same_typeEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_4:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_real_reductions_same_typeEy"}
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmultiple_real_reductions_same_typeEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_real_reductions_same_typeEz"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_real_reductions_same_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_8]] to %[[VAL_3]]#0 : f32, !fir.ref<f32>
+! CHECK:           %[[VAL_9:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_5]]#0 : f32, !fir.ref<f32>
+! CHECK:           %[[VAL_10:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_13:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_15:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_f_32_byref %[[VAL_3]]#0 -> %[[VAL_16:.*]] : !fir.ref<f32>, @add_reduction_f_32_byref %[[VAL_5]]#0 -> %[[VAL_17:.*]] : !fir.ref<f32>, @add_reduction_f_32_byref %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref<f32>)  for  (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) {
+! CHECK:               fir.store %[[VAL_19]] to %[[VAL_12]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_real_reductions_same_typeEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_real_reductions_same_typeEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_real_reductions_same_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_20]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_24:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i32) -> f32
+! CHECK:               %[[VAL_26:.*]] = arith.addf %[[VAL_23]], %[[VAL_25]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_20]]#0 : f32, !fir.ref<f32>
+! CHECK:               %[[VAL_27:.*]] = fir.load %[[VAL_21]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_28:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i32) -> f32
+! CHECK:               %[[VAL_30:.*]] = arith.addf %[[VAL_27]], %[[VAL_29]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_30]] to %[[VAL_21]]#0 : f32, !fir.ref<f32>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_22]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_32:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i32) -> f32
+! CHECK:               %[[VAL_34:.*]] = arith.addf %[[VAL_31]], %[[VAL_33]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_34]] to %[[VAL_22]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine multiple_real_reductions_same_type
+  real :: x,y,z
+  x = 0.0
+  y = 0.0
+  z = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions_different_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductions_different_typeEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f64 {bindc_name = "w", uniq_name = "_QFmultiple_reductions_different_typeEw"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFmultiple_reductions_different_typeEw"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_reductions_different_typeEx"}
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmultiple_reductions_different_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFmultiple_reductions_different_typeEy"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductions_different_typeEy"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_8:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_reductions_different_typeEz"}
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmultiple_reductions_different_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_10:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_10]] to %[[VAL_5]]#0 : i32, !fir.ref<i32>
+! CHECK:           %[[VAL_11:.*]] = arith.constant 0 : i64
+! CHECK:           hlfir.assign %[[VAL_11]] to %[[VAL_7]]#0 : i64, !fir.ref<i64>
+! CHECK:           %[[VAL_12:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_12]] to %[[VAL_9]]#0 : f32, !fir.ref<f32>
+! CHECK:           %[[VAL_13:.*]] = arith.constant 0.000000e+00 : f64
+! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_3]]#0 : f64, !fir.ref<f64>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_14:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_16:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_17:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_18:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_5]]#0 -> %[[VAL_19:.*]] : !fir.ref<i32>, @add_reduction_i_64_byref %[[VAL_7]]#0 -> %[[VAL_20:.*]] : !fir.ref<i64>, @add_reduction_f_32_byref %[[VAL_9]]#0 -> %[[VAL_21:.*]] : !fir.ref<f32>, @add_reduction_f_64_byref %[[VAL_3]]#0 -> %[[VAL_22:.*]] : !fir.ref<f64>)  for  (%[[VAL_23:.*]]) : i32 = (%[[VAL_16]]) to (%[[VAL_17]]) inclusive step (%[[VAL_18]]) {
+! CHECK:               fir.store %[[VAL_23]] to %[[VAL_15]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_19]] {uniq_name = "_QFmultiple_reductions_different_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_20]] {uniq_name = "_QFmultiple_reductions_different_typeEy"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:               %[[VAL_26:.*]]:2 = hlfir.declare %[[VAL_21]] {uniq_name = "_QFmultiple_reductions_different_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_22]] {uniq_name = "_QFmultiple_reductions_different_typeEw"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK:               %[[VAL_28:.*]] = fir.load %[[VAL_24]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_29:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_30:.*]] = arith.addi %[[VAL_28]], %[[VAL_29]] : i32
+! CHECK:               hlfir.assign %[[VAL_30]] to %[[VAL_24]]#0 : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_25]]#0 : !fir.ref<i64>
+! CHECK:               %[[VAL_32:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i32) -> i64
+! CHECK:               %[[VAL_34:.*]] = arith.addi %[[VAL_31]], %[[VAL_33]] : i64
+! CHECK:               hlfir.assign %[[VAL_34]] to %[[VAL_25]]#0 : i64, !fir.ref<i64>
+! CHECK:               %[[VAL_35:.*]] = fir.load %[[VAL_26]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_36:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_37:.*]] = fir.convert %[[VAL_36]] : (i32) -> f32
+! CHECK:               %[[VAL_38:.*]] = arith.addf %[[VAL_35]], %[[VAL_37]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_38]] to %[[VAL_26]]#0 : f32, !fir.ref<f32>
+! CHECK:               %[[VAL_39:.*]] = fir.load %[[VAL_27]]#0 : !fir.ref<f64>
+! CHECK:               %[[VAL_40:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i32) -> f64
+! CHECK:               %[[VAL_42:.*]] = arith.addf %[[VAL_39]], %[[VAL_41]] fastmath<contract> : f64
+! CHECK:               hlfir.assign %[[VAL_42]] to %[[VAL_27]]#0 : f64, !fir.ref<f64>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine multiple_reductions_different_type
+  integer :: x
+  integer(kind=8) :: y
+  real :: z
+  real(kind=8) :: w
+  x = 0
+  y = 0
+  z = 0.0
+  w = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z,w)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+    w = w + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir-byref.f90
new file mode 100644
index 00000000000000..3739b3ae36ea13
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir-byref.f90
@@ -0,0 +1,58 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction()
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<i32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]])
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = arith.addi %[[VAL_13]], %[[VAL_14]] : i32
+! CHECK:               hlfir.assign %[[VAL_15]] to %[[VAL_12]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+
+subroutine simple_int_reduction
+  integer :: x
+  x = 0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = x + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
new file mode 100644
index 00000000000000..15a6dde046fee1
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
@@ -0,0 +1,64 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @iand_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant -1 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.andi %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPreduction_iand(
+! CHECK-SAME:                                 %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_iandEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iandEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@iand_i_32_byref %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref<i32>)  for  (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) {
+! CHECK:               fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_iandEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64
+! CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_17]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = arith.andi %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               hlfir.assign %[[VAL_20]] to %[[VAL_14]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+
+
+
+subroutine reduction_iand(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(iand:x)
+  do i=1, 100
+    x = iand(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90
new file mode 100644
index 00000000000000..4e0957219fa5db
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90
@@ -0,0 +1,55 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! CHECK-LABEL:   omp.reduction.declare @ieor_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.xori %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+!CHECK-LABEL: @_QPreduction_ieor
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
+!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_ieorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+
+
+!CHECK: omp.parallel
+!CHECK: %[[I_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFreduction_ieorEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.wsloop byref reduction(@ieor_i_32_byref %[[X_DECL]]#0 -> %[[PRV:.+]] : !fir.ref<i32>) for
+!CHECK: fir.store %{{.*}} to %[[I_DECL]]#1 : !fir.ref<i32>
+!CHECK: %[[PRV_DECL:.+]]:2 = hlfir.declare %[[PRV]] {{.*}} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[I_32:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[I_64:.*]] = fir.convert %[[I_32]] : (i32) -> i64
+!CHECK: %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[I_64]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+!CHECK: %[[LPRV:.+]] = fir.load %[[PRV_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK: %[[RES:.+]] = arith.xori %[[LPRV]], %[[Y_I]] : i32
+!CHECK: hlfir.assign %[[RES]] to %[[PRV_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.yield
+!CHECK: omp.terminator
+
+subroutine reduction_ieor(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(ieor:x)
+  do i=1, 100
+    x = ieor(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90
new file mode 100644
index 00000000000000..712edaf95575da
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90
@@ -0,0 +1,64 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @ior_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.ori %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPreduction_ior(
+! CHECK-SAME:                                %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_iorEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@ior_i_32_byref %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref<i32>)  for  (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]])
+! CHECK:               fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_iorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64
+! CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_17]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = arith.ori %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               hlfir.assign %[[VAL_20]] to %[[VAL_14]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+
+
+
+subroutine reduction_ior(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(ior:x)
+  do i=1, 100
+    x = ior(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90
new file mode 100644
index 00000000000000..4162626f926acc
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90
@@ -0,0 +1,206 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @and_reduction : !fir.ref<!fir.logical<4>>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant true
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
+! CHECK:           %[[REF:.*]] = fir.alloca !fir.logical<4>
+! CHECK:           fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[RES:.*]] = arith.andi %[[VAL_2]], %[[VAL_3]] : i1
+! CHECK:           %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction(
+! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@and_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i32) -> i64
+! CHECK:               %[[VAL_21:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_20]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_21]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_18]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.andi %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.and.:x)
+  do i=1, 100
+     x = x .and. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine simple_reduction
+
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
+! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@and_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> i64
+! CHECK:               %[[VAL_20:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_19]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_21:.*]] = fir.load %[[VAL_20]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.andi %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.and.:x)
+  do i=1, 100
+  x = y(i) .and. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions(
+! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_10:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_12:.*]] = arith.constant true
+! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_7]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_14:.*]] = arith.constant true
+! CHECK:           %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_15]] to %[[VAL_9]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_16:.*]] = arith.constant true
+! CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_18:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_20:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_21:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_22:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@and_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]] : !fir.ref<!fir.logical<4>>, @and_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]] : !fir.ref<!fir.logical<4>>, @and_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) {
+! CHECK:               fir.store %[[VAL_26]] to %[[VAL_19]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_27]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
+! CHECK:               %[[VAL_33:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_32]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_34:.*]] = fir.load %[[VAL_33]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_34]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_37:.*]] = arith.andi %[[VAL_35]], %[[VAL_36]] : i1
+! CHECK:               %[[VAL_38:.*]] = fir.convert %[[VAL_37]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_38]] to %[[VAL_27]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_39:.*]] = fir.load %[[VAL_28]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_40:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i32) -> i64
+! CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_41]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_43:.*]] = fir.load %[[VAL_42]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_39]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_45:.*]] = fir.convert %[[VAL_43]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_46:.*]] = arith.andi %[[VAL_44]], %[[VAL_45]] : i1
+! CHECK:               %[[VAL_47:.*]] = fir.convert %[[VAL_46]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_47]] to %[[VAL_28]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_48:.*]] = fir.load %[[VAL_29]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_49:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_50:.*]] = fir.convert %[[VAL_49]] : (i32) -> i64
+! CHECK:               %[[VAL_51:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_50]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_52:.*]] = fir.load %[[VAL_51]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_53:.*]] = fir.convert %[[VAL_48]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_54:.*]] = fir.convert %[[VAL_52]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_55:.*]] = arith.andi %[[VAL_53]], %[[VAL_54]] : i1
+! CHECK:               %[[VAL_56:.*]] = fir.convert %[[VAL_55]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_56]] to %[[VAL_29]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+
+
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.and.:x,y,z)
+  do i=1, 100
+  x = x .and. w(i)
+  y = y .and. w(i)
+  z = z .and. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90
new file mode 100644
index 00000000000000..4c159f3a69f9fd
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90
@@ -0,0 +1,202 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @eqv_reduction : !fir.ref<!fir.logical<4>>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant true
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
+! CHECK:           %[[REF:.*]] = fir.alloca !fir.logical<4>
+! CHECK:           fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[RES:.*]] = arith.cmpi eq, %[[VAL_2]], %[[VAL_3]] : i1
+! CHECK:           %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction(
+! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i32) -> i64
+! CHECK:               %[[VAL_21:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_20]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_21]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_18]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.cmpi eq, %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x)
+  do i=1, 100
+    x = x .eqv. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
+! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> i64
+! CHECK:               %[[VAL_20:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_19]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_21:.*]] = fir.load %[[VAL_20]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.cmpi eq, %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x)
+  do i=1, 100
+  x = y(i) .eqv. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions(
+! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_10:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_12:.*]] = arith.constant true
+! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_7]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_14:.*]] = arith.constant true
+! CHECK:           %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_15]] to %[[VAL_9]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_16:.*]] = arith.constant true
+! CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_18:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_20:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_21:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_22:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) {
+! CHECK:               fir.store %[[VAL_26]] to %[[VAL_19]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_27]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
+! CHECK:               %[[VAL_33:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_32]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_34:.*]] = fir.load %[[VAL_33]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_34]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_37:.*]] = arith.cmpi eq, %[[VAL_35]], %[[VAL_36]] : i1
+! CHECK:               %[[VAL_38:.*]] = fir.convert %[[VAL_37]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_38]] to %[[VAL_27]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_39:.*]] = fir.load %[[VAL_28]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_40:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i32) -> i64
+! CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_41]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_43:.*]] = fir.load %[[VAL_42]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_39]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_45:.*]] = fir.convert %[[VAL_43]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_46:.*]] = arith.cmpi eq, %[[VAL_44]], %[[VAL_45]] : i1
+! CHECK:               %[[VAL_47:.*]] = fir.convert %[[VAL_46]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_47]] to %[[VAL_28]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_48:.*]] = fir.load %[[VAL_29]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_49:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_50:.*]] = fir.convert %[[VAL_49]] : (i32) -> i64
+! CHECK:               %[[VAL_51:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_50]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_52:.*]] = fir.load %[[VAL_51]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_53:.*]] = fir.convert %[[VAL_48]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_54:.*]] = fir.convert %[[VAL_52]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_55:.*]] = arith.cmpi eq, %[[VAL_53]], %[[VAL_54]] : i1
+! CHECK:               %[[VAL_56:.*]] = fir.convert %[[VAL_55]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_56]] to %[[VAL_29]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x,y,z)
+  do i=1, 100
+  x = x .eqv. w(i)
+  y = y .eqv. w(i)
+  z = z .eqv. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90
new file mode 100644
index 00000000000000..cfa8e47d3ca7ba
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90
@@ -0,0 +1,207 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @neqv_reduction : !fir.ref<!fir.logical<4>>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant false
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
+! CHECK:           %[[REF:.*]] = fir.alloca !fir.logical<4>
+! CHECK:           fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[RES:.*]] = arith.cmpi ne, %[[VAL_2]], %[[VAL_3]] : i1
+! CHECK:           %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction(
+! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i32) -> i64
+! CHECK:               %[[VAL_21:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_20]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_21]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_18]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.cmpi ne, %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x)
+  do i=1, 100
+    x = x .neqv. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
+! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> i64
+! CHECK:               %[[VAL_20:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_19]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_21:.*]] = fir.load %[[VAL_20]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.cmpi ne, %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x)
+  do i=1, 100
+  x = y(i) .neqv. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions(
+! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_10:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_12:.*]] = arith.constant true
+! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_7]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_14:.*]] = arith.constant true
+! CHECK:           %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_15]] to %[[VAL_9]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_16:.*]] = arith.constant true
+! CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_18:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_20:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_21:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_22:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) {
+! CHECK:               fir.store %[[VAL_26]] to %[[VAL_19]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_27]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
+! CHECK:               %[[VAL_33:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_32]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_34:.*]] = fir.load %[[VAL_33]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_34]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_37:.*]] = arith.cmpi ne, %[[VAL_35]], %[[VAL_36]] : i1
+! CHECK:               %[[VAL_38:.*]] = fir.convert %[[VAL_37]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_38]] to %[[VAL_27]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_39:.*]] = fir.load %[[VAL_28]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_40:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i32) -> i64
+! CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_41]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_43:.*]] = fir.load %[[VAL_42]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_39]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_45:.*]] = fir.convert %[[VAL_43]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_46:.*]] = arith.cmpi ne, %[[VAL_44]], %[[VAL_45]] : i1
+! CHECK:               %[[VAL_47:.*]] = fir.convert %[[VAL_46]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_47]] to %[[VAL_28]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_48:.*]] = fir.load %[[VAL_29]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_49:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_50:.*]] = fir.convert %[[VAL_49]] : (i32) -> i64
+! CHECK:               %[[VAL_51:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_50]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_52:.*]] = fir.load %[[VAL_51]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_53:.*]] = fir.convert %[[VAL_48]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_54:.*]] = fir.convert %[[VAL_52]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_55:.*]] = arith.cmpi ne, %[[VAL_53]], %[[VAL_54]] : i1
+! CHECK:               %[[VAL_56:.*]] = fir.convert %[[VAL_55]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_56]] to %[[VAL_29]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+! CHECK:         }
+
+
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x,y,z)
+  do i=1, 100
+  x = x .neqv. w(i)
+  y = y .neqv. w(i)
+  z = z .neqv. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90
new file mode 100644
index 00000000000000..c71ea02f9373fd
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90
@@ -0,0 +1,204 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @or_reduction : !fir.ref<!fir.logical<4>>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant false
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
+! CHECK:           %[[REF:.*]] = fir.alloca !fir.logical<4>
+! CHECK:           fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[RES:.*]] = arith.ori %[[VAL_2]], %[[VAL_3]] : i1
+! CHECK:           %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction(
+! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@or_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i32) -> i64
+! CHECK:               %[[VAL_21:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_20]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_21]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_18]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.ori %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.or.:x)
+  do i=1, 100
+    x = x .or. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
+! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@or_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> i64
+! CHECK:               %[[VAL_20:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_19]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_21:.*]] = fir.load %[[VAL_20]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.ori %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.or.:x)
+  do i=1, 100
+  x = y(i) .or. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions(
+! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_10:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_12:.*]] = arith.constant true
+! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_7]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_14:.*]] = arith.constant true
+! CHECK:           %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_15]] to %[[VAL_9]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_16:.*]] = arith.constant true
+! CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_18:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_20:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_21:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_22:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@or_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]] : !fir.ref<!fir.logical<4>>, @or_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]] : !fir.ref<!fir.logical<4>>, @or_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) {
+! CHECK:               fir.store %[[VAL_26]] to %[[VAL_19]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_27]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
+! CHECK:               %[[VAL_33:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_32]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_34:.*]] = fir.load %[[VAL_33]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_34]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_37:.*]] = arith.ori %[[VAL_35]], %[[VAL_36]] : i1
+! CHECK:               %[[VAL_38:.*]] = fir.convert %[[VAL_37]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_38]] to %[[VAL_27]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_39:.*]] = fir.load %[[VAL_28]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_40:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i32) -> i64
+! CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_41]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_43:.*]] = fir.load %[[VAL_42]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_39]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_45:.*]] = fir.convert %[[VAL_43]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_46:.*]] = arith.ori %[[VAL_44]], %[[VAL_45]] : i1
+! CHECK:               %[[VAL_47:.*]] = fir.convert %[[VAL_46]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_47]] to %[[VAL_28]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_48:.*]] = fir.load %[[VAL_29]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_49:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_50:.*]] = fir.convert %[[VAL_49]] : (i32) -> i64
+! CHECK:               %[[VAL_51:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_50]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_52:.*]] = fir.load %[[VAL_51]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_53:.*]] = fir.convert %[[VAL_48]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_54:.*]] = fir.convert %[[VAL_52]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_55:.*]] = arith.ori %[[VAL_53]], %[[VAL_54]] : i1
+! CHECK:               %[[VAL_56:.*]] = fir.convert %[[VAL_55]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_56]] to %[[VAL_29]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+
+
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.or.:x,y,z)
+  do i=1, 100
+  x = x .or. w(i)
+  y = y .or. w(i)
+  z = z .or. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-2-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-2-byref.f90
new file mode 100644
index 00000000000000..360cd34df2d110
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-2-byref.f90
@@ -0,0 +1,20 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+! CHECK: omp.wsloop byref reduction(@max_i_32
+! CHECK: arith.cmpi sgt
+! CHECK: arith.select
+
+module m1
+  intrinsic max
+end module m1
+program main
+  use m1, ren=>max
+  n=0
+  !$omp parallel do reduction(ren:n)
+  do i=1,100
+     n=max(n,i)
+  end do
+  if (n/=100) print *,101
+  print *,'pass'
+end program main
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90
new file mode 100644
index 00000000000000..f9bffc269bb8f5
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90
@@ -0,0 +1,152 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+!CHECK: omp.reduction.declare @max_f_32_byref : !fir.ref<f32>
+!CHECK-SAME: init {
+!CHECK:   %[[MINIMUM_VAL:.*]] = arith.constant -3.40282347E+38 : f32
+!CHECK:   %[[REF:.*]] = fir.alloca f32
+!CHECK:   fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<f32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+!CHECK:   %[[RES:.*]] = arith.maximumf %[[LD0]], %[[LD1]] {{.*}}: f32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<f32>)
+
+!CHECK-LABEL: omp.reduction.declare @max_i_32_byref : !fir.ref<i32>
+!CHECK-SAME: init {
+!CHECK:   %[[MINIMUM_VAL:.*]] = arith.constant -2147483648 : i32
+!CHECK:   %[[REF:.*]] = fir.alloca i32
+!CHECK:   fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<i32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+!CHECK:   %[[RES:.*]] = arith.maxsi %[[LD0]], %[[LD1]] : i32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   func.func @_QPreduction_max_int(
+! CHECK-SAME:                                    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_max_intEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@max_i_32_byref %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref<i32>)  for  (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) {
+! CHECK:               fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64
+! CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_17]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = arith.cmpi sgt, %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               hlfir.assign %[[VAL_21]] to %[[VAL_14]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+
+! CHECK-LABEL:   func.func @_QPreduction_max_real(
+! CHECK-SAME:                                     %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_max_realEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@max_f_32_byref %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref<f32>)  for  (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) {
+! CHECK:               fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64
+! CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref<f32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_20:.*]] = arith.cmpf ogt, %[[VAL_18]], %[[VAL_19]] fastmath<contract> : f32
+! CHECK:               %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_18]], %[[VAL_19]] : f32
+! CHECK:               hlfir.assign %[[VAL_21]] to %[[VAL_14]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_30:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_30]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_32:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_33:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_34:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@max_f_32_byref %[[VAL_4]]#0 -> %[[VAL_35:.*]] : !fir.ref<f32>)  for  (%[[VAL_36:.*]]) : i32 = (%[[VAL_32]]) to (%[[VAL_33]]) inclusive step (%[[VAL_34]]) {
+! CHECK:               fir.store %[[VAL_36]] to %[[VAL_31]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_37:.*]]:2 = hlfir.declare %[[VAL_35]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_38:.*]] = fir.load %[[VAL_31]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_39:.*]] = fir.convert %[[VAL_38]] : (i32) -> i64
+! CHECK:               %[[VAL_40:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_39]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+! CHECK:               %[[VAL_41:.*]] = fir.load %[[VAL_40]] : !fir.ref<f32>
+! CHECK:               %[[VAL_42:.*]] = fir.load %[[VAL_37]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_43:.*]] = arith.cmpf ogt, %[[VAL_41]], %[[VAL_42]] fastmath<contract> : f32
+! CHECK:               fir.if %[[VAL_43]] {
+! CHECK:                 %[[VAL_44:.*]] = fir.load %[[VAL_31]]#0 : !fir.ref<i32>
+! CHECK:                 %[[VAL_45:.*]] = fir.convert %[[VAL_44]] : (i32) -> i64
+! CHECK:                 %[[VAL_46:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_45]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+! CHECK:                 %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<f32>
+! CHECK:                 hlfir.assign %[[VAL_47]] to %[[VAL_37]]#0 : f32, !fir.ref<f32>
+! CHECK:               } else {
+! CHECK:               }
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+
+
+
+subroutine reduction_max_int(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    x = max(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
+
+subroutine reduction_max_real(y)
+  real :: x, y(:)
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    x = max(y(i), x)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    if (y(i) .gt. x) x = y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90
new file mode 100644
index 00000000000000..a296ce47f20f25
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90
@@ -0,0 +1,62 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:  omp.reduction.declare @max_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:   init {
+! CHECK:        ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:          %[[MINIMUM_VAL:.*]] = arith.constant -2147483648 : i32
+! CHECK:          %[[REF:.*]] = fir.alloca i32
+! CHECK:          fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref<i32>
+! CHECK:          omp.yield(%[[REF]] : !fir.ref<i32>)
+! CHECK:        combiner
+! CHECK:        ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:          %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:          %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:          %[[RES:.*]] = arith.maxsi %[[LD0]], %[[LD1]] : i32
+! CHECK:          fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:          omp.yield(%[[ARG0]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   func.func @_QPreduction_max_int(
+! CHECK-SAME:                                    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_max_intEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@max_i_32_byref %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref<i32>)  for  (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) {
+! CHECK:               fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64
+! CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_17]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = arith.cmpi sgt, %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               hlfir.assign %[[VAL_21]] to %[[VAL_14]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+
+
+subroutine reduction_max_int(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    x = max(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90
new file mode 100644
index 00000000000000..da9e686362b668
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90
@@ -0,0 +1,154 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+!CHECK: omp.reduction.declare @min_f_32_byref : !fir.ref<f32>
+!CHECK-SAME: init {
+!CHECK:   %[[MAXIMUM_VAL:.*]] = arith.constant 3.40282347E+38 : f32
+!CHECK:   %[[REF:.*]] = fir.alloca f32
+!CHECK:   fir.store %[[MAXIMUM_VAL]] to %[[REF]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<f32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+!CHECK:   %[[RES:.*]] = arith.minimumf %[[LD0]], %[[LD1]] {{.*}}: f32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<f32>)
+
+!CHECK-LABEL: omp.reduction.declare @min_i_32_byref : !fir.ref<i32>
+!CHECK-SAME: init {
+!CHECK:   %[[MAXIMUM_VAL:.*]] = arith.constant 2147483647 : i32
+!CHECK:   %[[REF:.*]] = fir.alloca i32
+!CHECK:   fir.store %[[MAXIMUM_VAL]] to %[[REF]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<i32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+!CHECK:   %[[RES:.*]] = arith.minsi %[[LD0]], %[[LD1]] : i32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   func.func @_QPreduction_min_int(
+! CHECK-SAME:                                    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_min_intEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@min_i_32_byref %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref<i32>)  for  (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) {
+! CHECK:               fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_min_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64
+! CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_17]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = arith.cmpi slt, %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               hlfir.assign %[[VAL_21]] to %[[VAL_14]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+
+! CHECK-LABEL:   func.func @_QPreduction_min_real(
+! CHECK-SAME:                                     %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_min_realEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@min_f_32_byref %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref<f32>)  for  (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) {
+! CHECK:               fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64
+! CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref<f32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_20:.*]] = arith.cmpf olt, %[[VAL_18]], %[[VAL_19]] fastmath<contract> : f32
+! CHECK:               %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_18]], %[[VAL_19]] : f32
+! CHECK:               hlfir.assign %[[VAL_21]] to %[[VAL_14]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_30:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_30]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_32:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_33:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_34:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@min_f_32_byref %[[VAL_4]]#0 -> %[[VAL_35:.*]] : !fir.ref<f32>)  for  (%[[VAL_36:.*]]) : i32 = (%[[VAL_32]]) to (%[[VAL_33]]) inclusive step (%[[VAL_34]]) {
+! CHECK:               fir.store %[[VAL_36]] to %[[VAL_31]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_37:.*]]:2 = hlfir.declare %[[VAL_35]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_38:.*]] = fir.load %[[VAL_31]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_39:.*]] = fir.convert %[[VAL_38]] : (i32) -> i64
+! CHECK:               %[[VAL_40:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_39]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+! CHECK:               %[[VAL_41:.*]] = fir.load %[[VAL_40]] : !fir.ref<f32>
+! CHECK:               %[[VAL_42:.*]] = fir.load %[[VAL_37]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_43:.*]] = arith.cmpf ogt, %[[VAL_41]], %[[VAL_42]] fastmath<contract> : f32
+! CHECK:               fir.if %[[VAL_43]] {
+! CHECK:                 %[[VAL_44:.*]] = fir.load %[[VAL_31]]#0 : !fir.ref<i32>
+! CHECK:                 %[[VAL_45:.*]] = fir.convert %[[VAL_44]] : (i32) -> i64
+! CHECK:                 %[[VAL_46:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_45]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+! CHECK:                 %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<f32>
+! CHECK:                 hlfir.assign %[[VAL_47]] to %[[VAL_37]]#0 : f32, !fir.ref<f32>
+! CHECK:               } else {
+! CHECK:               }
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+
+
+
+subroutine reduction_min_int(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+    x = min(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
+
+subroutine reduction_min_real(y)
+  real :: x, y(:)
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+    x = min(y(i), x)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+    if (y(i) .gt. x) x = y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90
new file mode 100644
index 00000000000000..00854281b87eac
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90
@@ -0,0 +1,414 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @multiply_reduction_f_64_byref : !fir.ref<f64>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<f64>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant 1.000000e+00 : f64
+! CHECK:           %[[REF:.*]] = fir.alloca f64
+! CHECK:           fir.store %[[VAL_1]] to %[[REF]] : !fir.ref<f64>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<f64>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<f64>, %[[ARG1:.*]]: !fir.ref<f64>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f64>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f64>
+! CHECK:           %[[RES:.*]] = arith.mulf %[[LD0]], %[[LD1]] fastmath<contract> : f64
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f64>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<f64>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @multiply_reduction_i_64_byref : !fir.ref<i64>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i64>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
+! CHECK:           %[[REF:.*]] = fir.alloca i64
+! CHECK:           fir.store %[[VAL_1]] to %[[REF]] : !fir.ref<i64>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<i64>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:          ^bb0(%[[ARG0:.*]]: !fir.ref<i64>, %[[ARG1:.*]]: !fir.ref<i64>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i64>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i64>
+! CHECK:           %[[RES:.*]] = arith.muli %[[LD0]], %[[LD1]] : i64
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i64>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i64>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @multiply_reduction_f_32_byref : !fir.ref<f32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:           %[[REF:.*]] = fir.alloca f32
+! CHECK:           fir.store %[[VAL_1]] to %[[REF]] : !fir.ref<f32>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<f32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+! CHECK:           %[[RES:.*]] = arith.mulf %[[LD0]], %[[LD1]] fastmath<contract> : f32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<f32>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @multiply_reduction_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i32
+! CHECK:           %[[REF:.*]] = fir.alloca i32
+! CHECK:           fir.store %[[VAL_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.muli %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@multiply_reduction_i_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<i32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = arith.muli %[[VAL_13]], %[[VAL_14]] : i32
+! CHECK:               hlfir.assign %[[VAL_15]] to %[[VAL_12]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_int_reduction
+  integer :: x
+  x = 1
+  !$omp parallel
+  !$omp do reduction(*:x)
+  do i=1, 10
+    x = x * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_real_reduction() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reductionEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_real_reductionEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@multiply_reduction_f_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<f32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reductionEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_14:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i32) -> f32
+! CHECK:               %[[VAL_16:.*]] = arith.mulf %[[VAL_13]], %[[VAL_15]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_16]] to %[[VAL_12]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_real_reduction
+  real :: x
+  x = 1.0
+  !$omp parallel
+  !$omp do reduction(*:x)
+  do i=1, 10
+    x = x * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction_switch_order() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_int_reduction_switch_orderEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@multiply_reduction_i_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<i32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reduction_switch_orderEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = arith.muli %[[VAL_13]], %[[VAL_14]] : i32
+! CHECK:               hlfir.assign %[[VAL_15]] to %[[VAL_12]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_int_reduction_switch_order
+  integer :: x
+  x = 1
+  !$omp parallel
+  !$omp do reduction(*:x)
+  do i=1, 10
+  x = i * x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_real_reduction_switch_order() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_real_reduction_switch_orderEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@multiply_reduction_f_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<f32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reduction_switch_orderEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i32) -> f32
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_16:.*]] = arith.mulf %[[VAL_14]], %[[VAL_15]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_16]] to %[[VAL_12]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_real_reduction_switch_order
+  real :: x
+  x = 1.0
+  !$omp parallel
+  !$omp do reduction(*:x)
+  do i=1, 10
+  x = i * x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_int_reductions_same_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_int_reductions_same_typeEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_int_reductions_same_typeEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFmultiple_int_reductions_same_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFmultiple_int_reductions_same_typeEy"}
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmultiple_int_reductions_same_typeEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_int_reductions_same_typeEz"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_int_reductions_same_typeEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:           hlfir.assign %[[VAL_8]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:           %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_5]]#0 : i32, !fir.ref<i32>
+! CHECK:           %[[VAL_10:.*]] = arith.constant 1 : i32
+! CHECK:           hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_13:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_15:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@multiply_reduction_i_32_byref %[[VAL_3]]#0 -> %[[VAL_16:.*]] : !fir.ref<i32>, @multiply_reduction_i_32_byref %[[VAL_5]]#0 -> %[[VAL_17:.*]] : !fir.ref<i32>, @multiply_reduction_i_32_byref %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref<i32>)  for  (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) {
+! CHECK:               fir.store %[[VAL_19]] to %[[VAL_12]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_int_reductions_same_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_int_reductions_same_typeEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_int_reductions_same_typeEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_20]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_24:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_25:.*]] = arith.muli %[[VAL_23]], %[[VAL_24]] : i32
+! CHECK:               hlfir.assign %[[VAL_25]] to %[[VAL_20]]#0 : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_26:.*]] = fir.load %[[VAL_21]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_27:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_28:.*]] = arith.muli %[[VAL_26]], %[[VAL_27]] : i32
+! CHECK:               hlfir.assign %[[VAL_28]] to %[[VAL_21]]#0 : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_29:.*]] = fir.load %[[VAL_22]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_31:.*]] = arith.muli %[[VAL_29]], %[[VAL_30]] : i32
+! CHECK:               hlfir.assign %[[VAL_31]] to %[[VAL_22]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine multiple_int_reductions_same_type
+  integer :: x,y,z
+  x = 1
+  y = 1
+  z = 1
+  !$omp parallel
+  !$omp do reduction(*:x,y,z)
+  do i=1, 10
+  x = x * i
+  y = y * i
+  z = z * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_real_reductions_same_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_real_reductions_same_typeEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFmultiple_real_reductions_same_typeEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFmultiple_real_reductions_same_typeEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_4:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_real_reductions_same_typeEy"}
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmultiple_real_reductions_same_typeEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_real_reductions_same_typeEz"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_real_reductions_same_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_8]] to %[[VAL_3]]#0 : f32, !fir.ref<f32>
+! CHECK:           %[[VAL_9:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_5]]#0 : f32, !fir.ref<f32>
+! CHECK:           %[[VAL_10:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_13:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_15:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@multiply_reduction_f_32_byref %[[VAL_3]]#0 -> %[[VAL_16:.*]] : !fir.ref<f32>, @multiply_reduction_f_32_byref %[[VAL_5]]#0 -> %[[VAL_17:.*]] : !fir.ref<f32>, @multiply_reduction_f_32_byref %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref<f32>)  for  (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) {
+! CHECK:               fir.store %[[VAL_19]] to %[[VAL_12]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_real_reductions_same_typeEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_real_reductions_same_typeEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_real_reductions_same_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_20]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_24:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i32) -> f32
+! CHECK:               %[[VAL_26:.*]] = arith.mulf %[[VAL_23]], %[[VAL_25]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_20]]#0 : f32, !fir.ref<f32>
+! CHECK:               %[[VAL_27:.*]] = fir.load %[[VAL_21]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_28:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i32) -> f32
+! CHECK:               %[[VAL_30:.*]] = arith.mulf %[[VAL_27]], %[[VAL_29]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_30]] to %[[VAL_21]]#0 : f32, !fir.ref<f32>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_22]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_32:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i32) -> f32
+! CHECK:               %[[VAL_34:.*]] = arith.mulf %[[VAL_31]], %[[VAL_33]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_34]] to %[[VAL_22]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine multiple_real_reductions_same_type
+  real :: x,y,z
+  x = 1
+  y = 1
+  z = 1
+  !$omp parallel
+  !$omp do reduction(*:x,y,z)
+  do i=1, 10
+    x = x * i
+    y = y * i
+    z = z * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions_different_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductions_different_typeEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f64 {bindc_name = "w", uniq_name = "_QFmultiple_reductions_different_typeEw"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFmultiple_reductions_different_typeEw"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_reductions_different_typeEx"}
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmultiple_reductions_different_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFmultiple_reductions_different_typeEy"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductions_different_typeEy"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_8:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_reductions_different_typeEz"}
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmultiple_reductions_different_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_10:.*]] = arith.constant 1 : i32
+! CHECK:           hlfir.assign %[[VAL_10]] to %[[VAL_5]]#0 : i32, !fir.ref<i32>
+! CHECK:           %[[VAL_11:.*]] = arith.constant 1 : i64
+! CHECK:           hlfir.assign %[[VAL_11]] to %[[VAL_7]]#0 : i64, !fir.ref<i64>
+! CHECK:           %[[VAL_12:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_12]] to %[[VAL_9]]#0 : f32, !fir.ref<f32>
+! CHECK:           %[[VAL_13:.*]] = arith.constant 1.000000e+00 : f64
+! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_3]]#0 : f64, !fir.ref<f64>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_14:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_16:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_17:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_18:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@multiply_reduction_i_32_byref %[[VAL_5]]#0 -> %[[VAL_19:.*]] : !fir.ref<i32>, @multiply_reduction_i_64_byref %[[VAL_7]]#0 -> %[[VAL_20:.*]] : !fir.ref<i64>, @multiply_reduction_f_32_byref %[[VAL_9]]#0 -> %[[VAL_21:.*]] : !fir.ref<f32>, @multiply_reduction_f_64_byref %[[VAL_3]]#0 -> %[[VAL_22:.*]] : !fir.ref<f64>)  for  (%[[VAL_23:.*]]) : i32 = (%[[VAL_16]]) to (%[[VAL_17]]) inclusive step (%[[VAL_18]]) {
+! CHECK:               fir.store %[[VAL_23]] to %[[VAL_15]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_19]] {uniq_name = "_QFmultiple_reductions_different_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_20]] {uniq_name = "_QFmultiple_reductions_different_typeEy"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:               %[[VAL_26:.*]]:2 = hlfir.declare %[[VAL_21]] {uniq_name = "_QFmultiple_reductions_different_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_22]] {uniq_name = "_QFmultiple_reductions_different_typeEw"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK:               %[[VAL_28:.*]] = fir.load %[[VAL_24]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_29:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_30:.*]] = arith.muli %[[VAL_28]], %[[VAL_29]] : i32
+! CHECK:               hlfir.assign %[[VAL_30]] to %[[VAL_24]]#0 : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_25]]#0 : !fir.ref<i64>
+! CHECK:               %[[VAL_32:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i32) -> i64
+! CHECK:               %[[VAL_34:.*]] = arith.muli %[[VAL_31]], %[[VAL_33]] : i64
+! CHECK:               hlfir.assign %[[VAL_34]] to %[[VAL_25]]#0 : i64, !fir.ref<i64>
+! CHECK:               %[[VAL_35:.*]] = fir.load %[[VAL_26]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_36:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_37:.*]] = fir.convert %[[VAL_36]] : (i32) -> f32
+! CHECK:               %[[VAL_38:.*]] = arith.mulf %[[VAL_35]], %[[VAL_37]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_38]] to %[[VAL_26]]#0 : f32, !fir.ref<f32>
+! CHECK:               %[[VAL_39:.*]] = fir.load %[[VAL_27]]#0 : !fir.ref<f64>
+! CHECK:               %[[VAL_40:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i32) -> f64
+! CHECK:               %[[VAL_42:.*]] = arith.mulf %[[VAL_39]], %[[VAL_41]] fastmath<contract> : f64
+! CHECK:               hlfir.assign %[[VAL_42]] to %[[VAL_27]]#0 : f64, !fir.ref<f64>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+
+subroutine multiple_reductions_different_type
+  integer :: x
+  integer(kind=8) :: y
+  real :: z
+  real(kind=8) :: w
+  x = 1
+  y = 1
+  z = 1
+  w = 1
+  !$omp parallel
+  !$omp do reduction(*:x,y,z,w)
+  do i=1, 10
+    x = x * i
+    y = y * i
+    z = z * i
+    w = w * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 5bbaa8c208b8cd..c9ee0c25194c23 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1339,10 +1339,12 @@ class OpenMPIRBuilder {
   ///                           in reductions.
   /// \param ReductionInfos     A list of info on each reduction variable.
   /// \param IsNoWait           A flag set if the reduction is marked as nowait.
+  /// \param IsByRef            A flag set if the reduction is using reference
+  /// or direct value.
   InsertPointTy createReductions(const LocationDescription &Loc,
                                  InsertPointTy AllocaIP,
                                  ArrayRef<ReductionInfo> ReductionInfos,
-                                 bool IsNoWait = false);
+                                 bool IsNoWait = false, bool IsByRef = false);
 
   ///}
 
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index d65ed8c11d86cc..c74c898e1e882d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2110,7 +2110,7 @@ Function *getFreshReductionFunc(Module &M) {
 
 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
     const LocationDescription &Loc, InsertPointTy AllocaIP,
-    ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait) {
+    ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait, bool IsByRef) {
   for (const ReductionInfo &RI : ReductionInfos) {
     (void)RI;
     assert(RI.Variable && "expected non-null variable");
@@ -2197,17 +2197,29 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
   for (auto En : enumerate(ReductionInfos)) {
     const ReductionInfo &RI = En.value();
     Type *ValueType = RI.ElementType;
-    Value *RedValue = Builder.CreateLoad(ValueType, RI.Variable,
-                                         "red.value." + Twine(En.index()));
+    // We have one less load for by-ref case because that load is now inside of
+    // the reduction region
+    Value *RedValue = nullptr;
+    if (!IsByRef) {
+      RedValue = Builder.CreateLoad(ValueType, RI.Variable,
+                                    "red.value." + Twine(En.index()));
+    }
     Value *PrivateRedValue =
         Builder.CreateLoad(ValueType, RI.PrivateVariable,
                            "red.private.value." + Twine(En.index()));
     Value *Reduced;
-    Builder.restoreIP(
-        RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced));
+    if (IsByRef) {
+      Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RI.Variable,
+                                        PrivateRedValue, Reduced));
+    } else {
+      Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RedValue,
+                                        PrivateRedValue, Reduced));
+    }
     if (!Builder.GetInsertBlock())
       return InsertPointTy();
-    Builder.CreateStore(Reduced, RI.Variable);
+    // for by-ref case, the load is inside of the reduction region
+    if (!IsByRef)
+      Builder.CreateStore(Reduced, RI.Variable);
   }
   Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
       IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
@@ -2219,7 +2231,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
   // function. There are no loads/stores here because they will be happening
   // inside the atomic elementwise reduction.
   Builder.SetInsertPoint(AtomicRedBlock);
-  if (CanGenerateAtomic) {
+  if (CanGenerateAtomic && !IsByRef) {
     for (const ReductionInfo &RI : ReductionInfos) {
       Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType,
                                               RI.Variable, RI.PrivateVariable));
@@ -2257,7 +2269,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
     Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
     if (!Builder.GetInsertBlock())
       return InsertPointTy();
-    Builder.CreateStore(Reduced, LHSPtr);
+    // store is inside of the reduction region when using by-ref
+    if (!IsByRef)
+      Builder.CreateStore(Reduced, LHSPtr);
   }
   Builder.CreateRetVoid();
 
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 39ff49f0ccf541..0bd402d626dc42 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -268,6 +268,9 @@ def ParallelOp : OpenMP_Op<"parallel", [
 
     The optional $proc_bind_val attribute controls the thread affinity for the execution
     of the parallel region.
+
+    The optional byref attribute controls whether reduction arguments are passed by
+    reference or by value.
   }];
 
   let arguments = (ins Optional<I1>:$if_expr_var,
@@ -278,7 +281,8 @@ def ParallelOp : OpenMP_Op<"parallel", [
              OptionalAttr<SymbolRefArrayAttr>:$reductions,
              OptionalAttr<ProcBindKindAttr>:$proc_bind_val,
              Variadic<AnyType>:$private_vars,
-             OptionalAttr<SymbolRefArrayAttr>:$privatizers);
+             OptionalAttr<SymbolRefArrayAttr>:$privatizers,
+             UnitAttr:$byref);
 
   let regions = (region AnyRegion:$region);
 
@@ -299,6 +303,7 @@ def ParallelOp : OpenMP_Op<"parallel", [
                 $allocators_vars, type($allocators_vars)
               ) `)`
           | `proc_bind` `(` custom<ClauseAttr>($proc_bind_val) `)`
+          | `byref` $byref
     ) custom<ParallelRegion>($region, $reduction_vars, type($reduction_vars),
                              $reductions, $private_vars, type($private_vars),
                              $privatizers) attr-dict
@@ -570,6 +575,9 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments,
     The optional `order` attribute specifies which order the iterations of the
     associate loops are executed in. Currently the only option for this
     attribute is "concurrent".
+    
+    The optional `byref` attribute indicates that reduction arguments should be
+    passed by reference.
   }];
 
   let arguments = (ins Variadic<IntLikeType>:$lowerBound,
@@ -584,6 +592,7 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments,
              OptionalAttr<ScheduleModifierAttr>:$schedule_modifier,
              UnitAttr:$simd_modifier,
              UnitAttr:$nowait,
+             UnitAttr:$byref,
              ConfinedAttr<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$ordered_val,
              OptionalAttr<OrderKindAttr>:$order_val,
              UnitAttr:$inclusive);
@@ -613,6 +622,7 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments,
                 $schedule_val, $schedule_modifier, $simd_modifier,
                 $schedule_chunk_var, type($schedule_chunk_var)) `)`
           |`nowait` $nowait
+          |`byref` $byref
           |`ordered` `(` $ordered_val `)`
           |`order` `(` custom<ClauseAttr>($order_val) `)`
     ) custom<WsLoop>($region, $lowerBound, $upperBound, $step,
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 8a6980e2c6a2d9..e7b899aec4afbf 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -1209,7 +1209,7 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state,
       /*allocate_vars=*/ValueRange(), /*allocators_vars=*/ValueRange(),
       /*reduction_vars=*/ValueRange(), /*reductions=*/nullptr,
       /*proc_bind_val=*/nullptr, /*private_vars=*/ValueRange(),
-      /*privatizers=*/nullptr);
+      /*privatizers=*/nullptr, /*byref=*/false);
   state.addAttributes(attributes);
 }
 
@@ -1674,7 +1674,8 @@ void WsLoopOp::build(OpBuilder &builder, OperationState &state,
         /*linear_step_vars=*/ValueRange(), /*reduction_vars=*/ValueRange(),
         /*reductions=*/nullptr, /*schedule_val=*/nullptr,
         /*schedule_chunk_var=*/nullptr, /*schedule_modifier=*/nullptr,
-        /*simd_modifier=*/false, /*nowait=*/false, /*ordered_val=*/nullptr,
+        /*simd_modifier=*/false, /*nowait=*/false, /*byref=*/false,
+        /*ordered_val=*/nullptr,
         /*order_val=*/nullptr, /*inclusive=*/false);
   state.addAttributes(attributes);
 }
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index bef227f2c583ed..5027f2afe92154 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -805,12 +805,12 @@ convertOmpTaskgroupOp(omp::TaskGroupOp tgOp, llvm::IRBuilderBase &builder,
 /// Allocate space for privatized reduction variables.
 template <typename T>
 static void
-allocReductionVars(T loop, llvm::IRBuilderBase &builder,
-                   LLVM::ModuleTranslation &moduleTranslation,
-                   llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
-                   SmallVector<omp::ReductionDeclareOp> &reductionDecls,
-                   SmallVector<llvm::Value *> &privateReductionVariables,
-                   DenseMap<Value, llvm::Value *> &reductionVariableMap) {
+allocByValReductionVars(T loop, llvm::IRBuilderBase &builder,
+                        LLVM::ModuleTranslation &moduleTranslation,
+                        llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+                        SmallVector<omp::ReductionDeclareOp> &reductionDecls,
+                        SmallVector<llvm::Value *> &privateReductionVariables,
+                        DenseMap<Value, llvm::Value *> &reductionVariableMap) {
   llvm::IRBuilderBase::InsertPointGuard guard(builder);
   builder.restoreIP(allocaIP);
   auto args =
@@ -863,6 +863,7 @@ static LogicalResult
 convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
                  LLVM::ModuleTranslation &moduleTranslation) {
   auto loop = cast<omp::WsLoopOp>(opInst);
+  const bool isByRef = loop.getByref();
   // TODO: this should be in the op verifier instead.
   if (loop.getLowerBound().empty())
     return failure();
@@ -888,18 +889,17 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
 
   SmallVector<llvm::Value *> privateReductionVariables;
   DenseMap<Value, llvm::Value *> reductionVariableMap;
-  allocReductionVars(loop, builder, moduleTranslation, allocaIP, reductionDecls,
-                     privateReductionVariables, reductionVariableMap);
-
-  // Store the mapping between reduction variables and their private copies on
-  // ModuleTranslation stack. It can be then recovered when translating
-  // omp.reduce operations in a separate call.
-  LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
-      moduleTranslation, reductionVariableMap);
+  if (!isByRef) {
+    allocByValReductionVars(loop, builder, moduleTranslation, allocaIP,
+                            reductionDecls, privateReductionVariables,
+                            reductionVariableMap);
+  }
 
   // Before the loop, store the initial values of reductions into reduction
   // variables. Although this could be done after allocas, we don't want to mess
   // up with the alloca insertion point.
+  MutableArrayRef<BlockArgument> reductionArgs =
+      loop.getRegion().getArguments().take_back(loop.getNumReductionVars());
   for (unsigned i = 0; i < loop.getNumReductionVars(); ++i) {
     SmallVector<llvm::Value *> phis;
     if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
@@ -908,9 +908,31 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
       return failure();
     assert(phis.size() == 1 && "expected one value to be yielded from the "
                                "reduction neutral element declaration region");
-    builder.CreateStore(phis[0], privateReductionVariables[i]);
+    if (isByRef) {
+      // Allocate reduction variable (which is a pointer to the real reduction
+      // variable allocated in the inlined region)
+      llvm::Value *var = builder.CreateAlloca(
+          moduleTranslation.convertType(reductionDecls[i].getType()));
+      // Store the result of the inlined region to the allocated reduction var
+      // ptr
+      builder.CreateStore(phis[0], var);
+
+      privateReductionVariables.push_back(var);
+      moduleTranslation.mapValue(reductionArgs[i], phis[0]);
+      reductionVariableMap.try_emplace(loop.getReductionVars()[i], phis[0]);
+    } else {
+      // for by-ref case the store is inside of the reduction region
+      builder.CreateStore(phis[0], privateReductionVariables[i]);
+      // the rest was handled in allocByValReductionVars
+    }
   }
 
+  // Store the mapping between reduction variables and their private copies on
+  // ModuleTranslation stack. It can be then recovered when translating
+  // omp.reduce operations in a separate call.
+  LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
+      moduleTranslation, reductionVariableMap);
+
   // Set up the source location value for OpenMP runtime.
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
 
@@ -1014,7 +1036,7 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
   builder.SetInsertPoint(tempTerminator);
   llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =
       ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos,
-                                   loop.getNowait());
+                                   loop.getNowait(), isByRef);
   if (!contInsertPoint.getBlock())
     return loop->emitOpError() << "failed to convert reductions";
   auto nextInsertionPoint =
@@ -1068,6 +1090,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation) {
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
   OmpParallelOpConversionManager raii(opInst);
+  const bool isByRef = opInst.getByref();
 
   // TODO: support error propagation in OpenMPIRBuilder and use it instead of
   // relying on captured variables.
@@ -1082,18 +1105,17 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
     // Allocate reduction vars
     SmallVector<llvm::Value *> privateReductionVariables;
     DenseMap<Value, llvm::Value *> reductionVariableMap;
-    allocReductionVars(opInst, builder, moduleTranslation, allocaIP,
-                       reductionDecls, privateReductionVariables,
-                       reductionVariableMap);
-
-    // Store the mapping between reduction variables and their private copies on
-    // ModuleTranslation stack. It can be then recovered when translating
-    // omp.reduce operations in a separate call.
-    LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
-        moduleTranslation, reductionVariableMap);
+    if (!isByRef) {
+      allocByValReductionVars(opInst, builder, moduleTranslation, allocaIP,
+                              reductionDecls, privateReductionVariables,
+                              reductionVariableMap);
+    }
 
     // Initialize reduction vars
     builder.restoreIP(allocaIP);
+    MutableArrayRef<BlockArgument> reductionArgs =
+        opInst.getRegion().getArguments().take_back(
+            opInst.getNumReductionVars());
     for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
       SmallVector<llvm::Value *> phis;
       if (failed(inlineConvertOmpRegions(
@@ -1104,9 +1126,32 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
              "expected one value to be yielded from the "
              "reduction neutral element declaration region");
       builder.restoreIP(allocaIP);
-      builder.CreateStore(phis[0], privateReductionVariables[i]);
+
+      if (isByRef) {
+        // Allocate reduction variable (which is a pointer to the real reduciton
+        // variable allocated in the inlined region)
+        llvm::Value *var = builder.CreateAlloca(
+            moduleTranslation.convertType(reductionDecls[i].getType()));
+        // Store the result of the inlined region to the allocated reduction var
+        // ptr
+        builder.CreateStore(phis[0], var);
+
+        privateReductionVariables.push_back(var);
+        moduleTranslation.mapValue(reductionArgs[i], phis[0]);
+        reductionVariableMap.try_emplace(opInst.getReductionVars()[i], phis[0]);
+      } else {
+        // for by-ref case the store is inside of the reduction init region
+        builder.CreateStore(phis[0], privateReductionVariables[i]);
+        // the rest is done in allocByValReductionVars
+      }
     }
 
+    // Store the mapping between reduction variables and their private copies on
+    // ModuleTranslation stack. It can be then recovered when translating
+    // omp.reduce operations in a separate call.
+    LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
+        moduleTranslation, reductionVariableMap);
+
     // Save the alloca insertion point on ModuleTranslation stack for use in
     // nested regions.
     LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
@@ -1137,7 +1182,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
 
       llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =
           ompBuilder->createReductions(builder.saveIP(), allocaIP,
-                                       reductionInfos, false);
+                                       reductionInfos, false, isByRef);
       if (!contInsertPoint.getBlock()) {
         bodyGenStatus = opInst->emitOpError() << "failed to convert reductions";
         return;
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir
new file mode 100644
index 00000000000000..4ac1ebd43e1ee0
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir
@@ -0,0 +1,66 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+  omp.reduction.declare @add_reduction_i_32 : !llvm.ptr init {
+  ^bb0(%arg0: !llvm.ptr):
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    %1 = llvm.mlir.constant(1 : i64) : i64
+    %2 = llvm.alloca %1 x i32 : (i64) -> !llvm.ptr
+    llvm.store %0, %2 : i32, !llvm.ptr
+    omp.yield(%2 : !llvm.ptr)
+  } combiner {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.load %arg0 : !llvm.ptr -> i32
+    %1 = llvm.load %arg1 : !llvm.ptr -> i32
+    %2 = llvm.add %0, %1  : i32
+    llvm.store %2, %arg0 : i32, !llvm.ptr
+    omp.yield(%arg0 : !llvm.ptr)
+  } 
+
+  // CHECK-LABEL: @main
+  llvm.func @main()  {
+    %0 = llvm.mlir.constant(-1 : i32) : i32
+    %1 = llvm.mlir.addressof @i : !llvm.ptr
+    omp.parallel byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr) {
+      llvm.store %0, %arg0 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
+  }
+  llvm.mlir.global internal @i() {addr_space = 0 : i32} : i32 {
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    llvm.return %0 : i32
+  }
+
+// CHECK: %{{.+}} = 
+// Call to the outlined function.
+// CHECK: call void {{.*}} @__kmpc_fork_call
+// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Outlined function.
+// CHECK: define internal void @[[OUTLINED]]
+
+// Private reduction variable and its initialization.
+// CHECK: %tid.addr.local = alloca i32
+// CHECK: %[[PRIVATE:.+]] = alloca i32
+// CHECK: store i32 0, ptr %[[PRIVATE]]
+// CHECK: store ptr %[[PRIVATE]], ptr %[[PRIV_PTR:.+]],
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+
+// Non-atomic reduction:
+// CHECK: %[[PRIV_VAL_PTR:.+]] = load ptr, ptr %[[PRIV_PTR]]
+// CHECK: %[[LOAD:.+]] = load i32, ptr @i
+// CHECK: %[[PRIV_VAL:.+]] = load i32, ptr %[[PRIV_VAL_PTR]]
+// CHECK: %[[SUM:.+]] = add i32 %[[LOAD]], %[[PRIV_VAL]]
+// CHECK: store i32 %[[SUM]], ptr @i
+// CHECK: call void @__kmpc_end_reduce
+// CHECK: br label %[[FINALIZE:.+]]
+
+// CHECK: [[FINALIZE]]:
+
+// Reduction function.
+// CHECK: define internal void @[[REDFUNC]]
+// CHECK: add i32

>From 3114a1239e78ed168b03ddfc456a4fa441836826 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Tue, 12 Mar 2024 16:20:01 +0000
Subject: [PATCH 4/7] Fix regression on min reduction

This makes sure that the generated IR doesn't change as a result of this
PR. The generated IR looks wrong to me (no reduction is generated at
all), but that is a matter for another patch.
---
 flang/lib/Lower/OpenMP/ReductionProcessor.cpp | 13 ++++++
 .../Lower/OpenMP/wsloop-reduction-min2.f90    | 41 +++++++++++++++++++
 2 files changed, 54 insertions(+)
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-min2.f90

diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
index 34959e95dce04a..a88f863ddd3d42 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
@@ -371,6 +371,19 @@ void ReductionProcessor::addReductionDecl(
       std::get<Fortran::parser::OmpReductionOperator>(reduction.t)};
   const auto &objectList{std::get<Fortran::parser::OmpObjectList>(reduction.t)};
 
+  if (!std::holds_alternative<Fortran::parser::DefinedOperator>(
+          redOperator.u)) {
+    if (const auto *reductionIntrinsic =
+            std::get_if<Fortran::parser::ProcedureDesignator>(&redOperator.u)) {
+      if (!ReductionProcessor::supportedIntrinsicProcReduction(
+              *reductionIntrinsic)) {
+        return;
+      }
+    } else {
+      return;
+    }
+  }
+
   // initial pass to collect all recuction vars so we can figure out if this
   // should happen byref
   for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90
new file mode 100644
index 00000000000000..9289973bae2029
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90
@@ -0,0 +1,41 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s
+
+! regression test for crash
+
+program reduce
+integer :: i = 0
+integer :: r = 0
+
+!$omp parallel do reduction(min:r)
+do i=0,10
+   r = i
+enddo
+!$omp end parallel do
+
+print *,r
+
+end program
+
+! TODO: the reduction is not curently lowered correctly. This test is checking
+! that we do not crash and we still produce the same broken IR as before.
+
+! CHECK-LABEL:   func.func @_QQmain() attributes {fir.bindc_name = "reduce"} {
+! CHECK:           %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<i32>
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref<i32>
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFEr"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_4:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK:             %[[VAL_7:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop  for  (%[[VAL_9:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
+! CHECK:               fir.store %[[VAL_9]] to %[[VAL_5]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_10:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+! CHECK:               hlfir.assign %[[VAL_10]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }

>From 85df0051a5cb785fe491b9876c66e5b5f36392f3 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Tue, 12 Mar 2024 16:24:36 +0000
Subject: [PATCH 5/7] Add TODO message

---
 flang/lib/Lower/OpenMP/ReductionProcessor.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
index a88f863ddd3d42..e6a63dd4b939ce 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
@@ -339,6 +339,10 @@ mlir::omp::ReductionDeclareOp ReductionProcessor::createReductionDecl(
   return decl;
 }
 
+// TODO: By-ref vs by-val reductions are currently toggled for the whole
+//       operation (possibly effecting multiple reduction variables).
+//       This could cause a problem with openmp target reductions because
+//       by-ref trivial types may not be supported.
 bool ReductionProcessor::doReductionByRef(
     const llvm::SmallVectorImpl<mlir::Value> &reductionVars) {
   if (reductionVars.empty())

>From 63c7fe3c522047b4df964a137ffdf0bbc38acec8 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Wed, 14 Feb 2024 15:22:02 +0000
Subject: [PATCH 6/7] [flang] support fir.alloca operations inside of omp
 reduction ops

Advise to place the alloca at the start of the first block of whichever
region (init or combiner) we are currently inside.

It probably isn't safe to put an alloca inside of a combiner region
because this will be executed multiple times. But that would be a bug to
fix in Lower/OpenMP.cpp, not here.
---
 flang/lib/Optimizer/Builder/FIRBuilder.cpp |  2 ++
 flang/lib/Optimizer/CodeGen/CodeGen.cpp    | 11 +++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 12da7412888a3b..f7327a299d9a5e 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -208,6 +208,8 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() {
               .getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>()) {
     return ompOutlineableIface.getAllocaBlock();
   }
+  if (mlir::isa<mlir::omp::ReductionDeclareOp>(getRegion().getParentOp()))
+    return &getRegion().front();
   if (auto accRecipeIface =
           getRegion().getParentOfType<mlir::acc::RecipeInterface>()) {
     return accRecipeIface.getAllocaBlock(getRegion());
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index f81a08388da722..123eb6e4e6a255 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -410,8 +410,15 @@ class FIROpConversion : public mlir::ConvertOpToLLVMPattern<FromOp> {
       mlir::ConversionPatternRewriter &rewriter) const {
     auto thisPt = rewriter.saveInsertionPoint();
     mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp();
-    mlir::Block *insertBlock = getBlockForAllocaInsert(parentOp);
-    rewriter.setInsertionPointToStart(insertBlock);
+    if (mlir::isa<mlir::omp::ReductionDeclareOp>(parentOp)) {
+      // ReductionDeclareOp has multiple child regions. We want to get the first
+      // block of whichever of those regions we are currently in
+      mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent();
+      rewriter.setInsertionPointToStart(&parentRegion->front());
+    } else {
+      mlir::Block *insertBlock = getBlockForAllocaInsert(parentOp);
+      rewriter.setInsertionPointToStart(insertBlock);
+    }
     auto size = genI32Constant(loc, rewriter, 1);
     unsigned allocaAs = getAllocaAddressSpace(rewriter);
     unsigned programAs = getProgramAddressSpace(rewriter);

>From 636566e561a3c065c07793db9c972b477a729f53 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Thu, 14 Mar 2024 16:19:08 +0000
Subject: [PATCH 7/7] Add test

---
 .../test/Fir/omp-reduction-embox-codegen.fir  | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 flang/test/Fir/omp-reduction-embox-codegen.fir

diff --git a/flang/test/Fir/omp-reduction-embox-codegen.fir b/flang/test/Fir/omp-reduction-embox-codegen.fir
new file mode 100644
index 00000000000000..24bde536667b50
--- /dev/null
+++ b/flang/test/Fir/omp-reduction-embox-codegen.fir
@@ -0,0 +1,36 @@
+// RUN: tco %s | FileCheck %s
+
+// the fir.embox in the init region is turned into an alloca for the box. Test
+// that CodeGen.cpp knows where to place an alloca when it is inside of an
+// omp.reduction.declare
+
+// regretably this has to be nonsense IR because we need the subsequent patches
+// to process anything useful
+
+omp.reduction.declare @test_reduction : !fir.ref<!fir.box<i32>> init {
+^bb0(%arg0: !fir.ref<!fir.box<i32>>):
+  %0 = fir.alloca !fir.box<i32>
+  %1 = fir.alloca i32
+  %2 = fir.embox %1 : (!fir.ref<i32>) -> !fir.box<i32>
+
+  // use the embox for something so it isn't removed
+  fir.store %2 to %0 : !fir.ref<!fir.box<i32>>
+
+  omp.yield(%0 : !fir.ref<!fir.box<i32>>)
+} combiner {
+^bb0(%arg0: !fir.ref<!fir.box<i32>>, %arg1: !fir.ref<!fir.box<i32>>):
+  %0 = fir.undefined !fir.ref<!fir.box<i32>>
+  omp.yield(%0 : !fir.ref<!fir.box<i32>>)
+}
+
+func.func @_QQmain() attributes {fir.bindc_name = "reduce"} {
+  %4 = fir.alloca !fir.box<i32>
+  omp.parallel byref reduction(@test_reduction %4 -> %arg0 : !fir.ref<!fir.box<i32>>) {
+    omp.terminator
+  }
+  return
+}
+
+// basically we are testing that there isn't a crash
+// CHECK-LABEL: define void @_QQmain
+// CHECK-NEXT:    alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8



More information about the flang-commits mailing list