[flang-commits] [flang] 3ada883 - [flang][runtime] Runtime support for REDUCE() (#86214)

via flang-commits flang-commits at lists.llvm.org
Tue Mar 26 09:21:21 PDT 2024


Author: Peter Klausler
Date: 2024-03-26T09:21:16-07:00
New Revision: 3ada883f7c96e099e1a665c091751bff5f16690e

URL: https://github.com/llvm/llvm-project/commit/3ada883f7c96e099e1a665c091751bff5f16690e
DIFF: https://github.com/llvm/llvm-project/commit/3ada883f7c96e099e1a665c091751bff5f16690e.diff

LOG: [flang][runtime] Runtime support for REDUCE() (#86214)

Supports the REDUCE() transformational intrinsic function of Fortran
(see F'2023 16.9.173) in a manner similar to the existing support for
SUM(), PRODUCT(), &c. There are APIs for total reductions to scalar
results, and APIs for partial reductions that reduce the rank of the
argument by one.

This implementation requires more functions than other reductions
because the various possible types of the user-supplied OPERATION=
function need to be elaborated.

Once the basic API in reduce.h has been approved, later patches will
implement lowering.

REDUCE() is primarily for completeness, not portability; only one other
Fortran compiler implements this F'2018 feature today, and only some
types work correctly with it.

Added: 
    flang/include/flang/Runtime/reduce.h
    flang/runtime/reduce.cpp

Modified: 
    flang/include/flang/Runtime/reduction.h
    flang/lib/Semantics/check-call.cpp
    flang/runtime/CMakeLists.txt
    flang/runtime/complex-reduction.c
    flang/runtime/complex-reduction.h
    flang/runtime/io-api.cpp
    flang/runtime/reduction-templates.h
    flang/runtime/tools.cpp
    flang/runtime/tools.h
    flang/unittests/Runtime/Reduction.cpp

Removed: 
    


################################################################################
diff  --git a/flang/include/flang/Runtime/reduce.h b/flang/include/flang/Runtime/reduce.h
new file mode 100644
index 00000000000000..975aa6dea305f5
--- /dev/null
+++ b/flang/include/flang/Runtime/reduce.h
@@ -0,0 +1,257 @@
+//===-- include/flang/Runtime/reduce.h --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Defines the API for implementations of the transformational intrinsic
+// function REDUCE(); see F'2023 16.9.173.
+//
+// Similar to the definition of the APIs for SUM(), &c., in reduction.h,
+// there are typed functions here like ReduceInteger4() for total reductions
+// to scalars and void functions like ReduceInteger4Dim() for partial
+// reductions to smaller arrays.
+
+#ifndef FORTRAN_RUNTIME_REDUCE_H_
+#define FORTRAN_RUNTIME_REDUCE_H_
+
+#include "flang/Common/float128.h"
+#include "flang/Common/uint128.h"
+#include "flang/Runtime/cpp-type.h"
+#include "flang/Runtime/entry-names.h"
+#include <complex>
+#include <cstdint>
+
+namespace Fortran::runtime {
+
+class Descriptor;
+
+template <typename T> using ReductionOperation = T (*)(const T *, const T *);
+template <typename CHAR>
+using ReductionCharOperation = void (*)(CHAR *hiddenResult,
+    std::size_t resultLen, const CHAR *x, const CHAR *y, std::size_t xLen,
+    std::size_t yLen);
+using ReductionDerivedTypeOperation = void (*)(
+    void *hiddenResult, const void *x, const void *y);
+
+extern "C" {
+
+std::int8_t RTDECL(ReduceInteger1)(const Descriptor &,
+    ReductionOperation<std::int8_t>, const char *source, int line, int dim = 0,
+    const Descriptor *mask = nullptr, const std::int8_t *identity = nullptr,
+    bool ordered = true);
+void RTDECL(ReduceInteger1Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int8_t>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const std::int8_t *identity = nullptr,
+    bool ordered = true);
+std::int16_t RTDECL(ReduceInteger2)(const Descriptor &,
+    ReductionOperation<std::int16_t>, const char *source, int line, int dim = 0,
+    const Descriptor *mask = nullptr, const std::int16_t *identity = nullptr,
+    bool ordered = true);
+void RTDECL(ReduceInteger2Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int16_t>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const std::int16_t *identity = nullptr,
+    bool ordered = true);
+std::int32_t RTDECL(ReduceInteger4)(const Descriptor &,
+    ReductionOperation<std::int32_t>, const char *source, int line, int dim = 0,
+    const Descriptor *mask = nullptr, const std::int32_t *identity = nullptr,
+    bool ordered = true);
+void RTDECL(ReduceInteger4Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int32_t>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const std::int32_t *identity = nullptr,
+    bool ordered = true);
+std::int64_t RTDECL(ReduceInteger8)(const Descriptor &,
+    ReductionOperation<std::int64_t>, const char *source, int line, int dim = 0,
+    const Descriptor *mask = nullptr, const std::int64_t *identity = nullptr,
+    bool ordered = true);
+void RTDECL(ReduceInteger8Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int64_t>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const std::int64_t *identity = nullptr,
+    bool ordered = true);
+#ifdef __SIZEOF_INT128__
+common::int128_t RTDECL(ReduceInteger16)(const Descriptor &,
+    ReductionOperation<common::int128_t>, const char *source, int line,
+    int dim = 0, const Descriptor *mask = nullptr,
+    const common::int128_t *identity = nullptr, bool ordered = true);
+void RTDECL(ReduceInteger16Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<common::int128_t>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr,
+    const common::int128_t *identity = nullptr, bool ordered = true);
+#endif
+
+// REAL/COMPLEX(2 & 3) return 32-bit float results for the caller to downconvert
+float RTDECL(ReduceReal2)(const Descriptor &, ReductionOperation<float>,
+    const char *source, int line, int dim = 0, const Descriptor *mask = nullptr,
+    const float *identity = nullptr, bool ordered = true);
+void RTDECL(ReduceReal2Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<float>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const float *identity = nullptr,
+    bool ordered = true);
+float RTDECL(ReduceReal3)(const Descriptor &, ReductionOperation<float>,
+    const char *source, int line, int dim = 0, const Descriptor *mask = nullptr,
+    const float *identity = nullptr, bool ordered = true);
+void RTDECL(ReduceReal3Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<float>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const float *identity = nullptr,
+    bool ordered = true);
+float RTDECL(ReduceReal4)(const Descriptor &, ReductionOperation<float>,
+    const char *source, int line, int dim = 0, const Descriptor *mask = nullptr,
+    const float *identity = nullptr, bool ordered = true);
+void RTDECL(ReduceReal4Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<float>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const float *identity = nullptr,
+    bool ordered = true);
+double RTDECL(ReduceReal8)(const Descriptor &, ReductionOperation<double>,
+    const char *source, int line, int dim = 0, const Descriptor *mask = nullptr,
+    const double *identity = nullptr, bool ordered = true);
+void RTDECL(ReduceReal8Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<double>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const double *identity = nullptr,
+    bool ordered = true);
+#if LDBL_MANT_DIG == 64
+long double RTDECL(ReduceReal10)(const Descriptor &,
+    ReductionOperation<long double>, const char *source, int line, int dim = 0,
+    const Descriptor *mask = nullptr, const long double *identity = nullptr,
+    bool ordered = true);
+void RTDECL(ReduceReal10Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<long double>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const long double *identity = nullptr,
+    bool ordered = true);
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppFloat128Type RTDECL(ReduceReal16)(const Descriptor &,
+    ReductionOperation<CppFloat128Type>, const char *source, int line,
+    int dim = 0, const Descriptor *mask = nullptr,
+    const CppFloat128Type *identity = nullptr, bool ordered = true);
+void RTDECL(ReduceReal16Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<CppFloat128Type>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const CppFloat128Type *identity = nullptr,
+    bool ordered = true);
+#endif
+
+void RTDECL(CppReduceComplex2)(std::complex<float> &, const Descriptor &,
+    ReductionOperation<std::complex<float>>, const char *source, int line,
+    int dim = 0, const Descriptor *mask = nullptr,
+    const std::complex<float> *identity = nullptr, bool ordered = true);
+void RTDECL(CppReduceComplex2Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::complex<float>>, const char *source, int line,
+    int dim, const Descriptor *mask = nullptr,
+    const std::complex<float> *identity = nullptr, bool ordered = true);
+void RTDECL(CppReduceComplex3)(std::complex<float> &, const Descriptor &,
+    ReductionOperation<std::complex<float>>, const char *source, int line,
+    int dim = 0, const Descriptor *mask = nullptr,
+    const std::complex<float> *identity = nullptr, bool ordered = true);
+void RTDECL(CppReduceComplex3Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::complex<float>>, const char *source, int line,
+    int dim, const Descriptor *mask = nullptr,
+    const std::complex<float> *identity = nullptr, bool ordered = true);
+void RTDECL(CppReduceComplex4)(std::complex<float> &, const Descriptor &,
+    ReductionOperation<std::complex<float>>, const char *source, int line,
+    int dim = 0, const Descriptor *mask = nullptr,
+    const std::complex<float> *identity = nullptr, bool ordered = true);
+void RTDECL(CppReduceComplex4Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::complex<float>>, const char *source, int line,
+    int dim, const Descriptor *mask = nullptr,
+    const std::complex<float> *identity = nullptr, bool ordered = true);
+void RTDECL(CppReduceComplex8)(std::complex<double> &, const Descriptor &,
+    ReductionOperation<std::complex<double>>, const char *source, int line,
+    int dim = 0, const Descriptor *mask = nullptr,
+    const std::complex<double> *identity = nullptr, bool ordered = true);
+void RTDECL(CppReduceComplex8Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::complex<double>>, const char *source, int line,
+    int dim, const Descriptor *mask = nullptr,
+    const std::complex<double> *identity = nullptr, bool ordered = true);
+#if LDBL_MANT_DIG == 64
+void RTDECL(CppReduceComplex10)(std::complex<long double> &, const Descriptor &,
+    ReductionOperation<std::complex<long double>>, const char *source, int line,
+    int dim = 0, const Descriptor *mask = nullptr,
+    const std::complex<long double> *identity = nullptr, bool ordered = true);
+void RTDECL(CppReduceComplex10Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::complex<long double>>, const char *source, int line,
+    int dim, const Descriptor *mask = nullptr,
+    const std::complex<long double> *identity = nullptr, bool ordered = true);
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+void RTDECL(CppReduceComplex16)(std::complex<CppFloat128Type> &,
+    const Descriptor &, ReductionOperation<std::complex<CppFloat128Type>>,
+    const char *source, int line, int dim = 0, const Descriptor *mask = nullptr,
+    const std::complex<CppFloat128Type> *identity = nullptr,
+    bool ordered = true);
+void RTDECL(CppReduceComplex16Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::complex<CppFloat128Type>>, const char *source,
+    int line, int dim, const Descriptor *mask = nullptr,
+    const std::complex<CppFloat128Type> *identity = nullptr,
+    bool ordered = true);
+#endif
+
+bool RTDECL(ReduceLogical1)(const Descriptor &, ReductionOperation<std::int8_t>,
+    const char *source, int line, int dim = 0, const Descriptor *mask = nullptr,
+    const std::int8_t *identity = nullptr, bool ordered = true);
+void RTDECL(ReduceLogical1Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int8_t>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const std::int8_t *identity = nullptr,
+    bool ordered = true);
+bool RTDECL(ReduceLogical2)(const Descriptor &,
+    ReductionOperation<std::int16_t>, const char *source, int line, int dim = 0,
+    const Descriptor *mask = nullptr, const std::int16_t *identity = nullptr,
+    bool ordered = true);
+void RTDECL(ReduceLogical2Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int16_t>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const std::int16_t *identity = nullptr,
+    bool ordered = true);
+bool RTDECL(ReduceLogical4)(const Descriptor &,
+    ReductionOperation<std::int32_t>, const char *source, int line, int dim = 0,
+    const Descriptor *mask = nullptr, const std::int32_t *identity = nullptr,
+    bool ordered = true);
+void RTDECL(ReduceLogical4Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int32_t>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const std::int32_t *identity = nullptr,
+    bool ordered = true);
+bool RTDECL(ReduceLogical8)(const Descriptor &,
+    ReductionOperation<std::int64_t>, const char *source, int line, int dim = 0,
+    const Descriptor *mask = nullptr, const std::int64_t *identity = nullptr,
+    bool ordered = true);
+void RTDECL(ReduceLogical8Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int64_t>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const std::int64_t *identity = nullptr,
+    bool ordered = true);
+
+void RTDECL(ReduceChar1)(char *result, const Descriptor &array,
+    ReductionCharOperation<char>, const char *source, int line, int dim = 0,
+    const Descriptor *mask = nullptr, const char *identity = nullptr,
+    bool ordered = true);
+void RTDECL(ReduceCharacter1Dim)(Descriptor &result, const Descriptor &array,
+    ReductionCharOperation<char>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const char *identity = nullptr,
+    bool ordered = true);
+void RTDECL(ReduceChar2)(char16_t *result, const Descriptor &array,
+    ReductionCharOperation<char16_t>, const char *source, int line, int dim = 0,
+    const Descriptor *mask = nullptr, const char16_t *identity = nullptr,
+    bool ordered = true);
+void RTDECL(ReduceCharacter2Dim)(Descriptor &result, const Descriptor &array,
+    ReductionCharOperation<char16_t>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const char16_t *identity = nullptr,
+    bool ordered = true);
+void RTDECL(ReduceChar4)(char32_t *result, const Descriptor &array,
+    ReductionCharOperation<char32_t>, const char *source, int line, int dim = 0,
+    const Descriptor *mask = nullptr, const char32_t *identity = nullptr,
+    bool ordered = true);
+void RTDECL(ReduceCharacter4Dim)(Descriptor &result, const Descriptor &array,
+    ReductionCharOperation<char32_t>, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const char32_t *identity = nullptr,
+    bool ordered = true);
+
+void RTDECL(ReduceDerivedType)(char *result, const Descriptor &array,
+    ReductionDerivedTypeOperation, const char *source, int line, int dim = 0,
+    const Descriptor *mask = nullptr, const char *identity = nullptr,
+    bool ordered = true);
+void RTDECL(ReduceDerivedTypeDim)(Descriptor &result, const Descriptor &array,
+    ReductionDerivedTypeOperation, const char *source, int line, int dim,
+    const Descriptor *mask = nullptr, const char *identity = nullptr,
+    bool ordered = true);
+
+} // extern "C"
+} // namespace Fortran::runtime
+#endif // FORTRAN_RUNTIME_REDUCE_H_

diff  --git a/flang/include/flang/Runtime/reduction.h b/flang/include/flang/Runtime/reduction.h
index 5b607765857523..97986c12e8a10e 100644
--- a/flang/include/flang/Runtime/reduction.h
+++ b/flang/include/flang/Runtime/reduction.h
@@ -89,9 +89,11 @@ void RTDECL(CppSumComplex4)(std::complex<float> &, const Descriptor &,
 void RTDECL(CppSumComplex8)(std::complex<double> &, const Descriptor &,
     const char *source, int line, int dim = 0,
     const Descriptor *mask = nullptr);
+#if LDBL_MANT_DIG == 64
 void RTDECL(CppSumComplex10)(std::complex<long double> &, const Descriptor &,
     const char *source, int line, int dim = 0,
     const Descriptor *mask = nullptr);
+#endif
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 void RTDECL(CppSumComplex16)(std::complex<CppFloat128Type> &,
     const Descriptor &, const char *source, int line, int dim = 0,

diff  --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index d625f8c2f7fc11..51a16ee155fabb 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -1588,6 +1588,9 @@ static void CheckReduce(
       procChars->dummyArguments.size() != 2 || !procChars->functionResult) {
     messages.Say(
         "OPERATION= argument of REDUCE() must be a pure function of two data arguments"_err_en_US);
+  } else if (procChars->attrs.test(characteristics::Procedure::Attr::BindC)) {
+    messages.Say(
+        "A BIND(C) OPERATION= argument of REDUCE() is not supported"_err_en_US);
   } else if (!result || result->Rank() != 0) {
     messages.Say(
         "OPERATION= argument of REDUCE() must be a scalar function"_err_en_US);

diff  --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index 335ef370727461..facd14432b3eed 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -153,6 +153,7 @@ set(sources
   pseudo-unit.cpp
   ragged.cpp
   random.cpp
+  reduce.cpp
   reduction.cpp
   stat.cpp
   stop.cpp

diff  --git a/flang/runtime/complex-reduction.c b/flang/runtime/complex-reduction.c
index c91d1253991176..7654de8080a152 100644
--- a/flang/runtime/complex-reduction.c
+++ b/flang/runtime/complex-reduction.c
@@ -155,3 +155,25 @@ ADAPT_REDUCTION(DotProductComplex10, long_double_Complex_t,
 ADAPT_REDUCTION(DotProductComplex16, CFloat128ComplexType, CppComplexFloat128,
     CMPLXF128, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES)
 #endif
+
+/* REDUCE() */
+#define RARGS REDUCE_ARGS(float_Complex_t)
+ADAPT_REDUCTION(ReduceComplex4, float_Complex_t, CppComplexFloat, CMPLXF, RARGS,
+    REDUCE_ARG_NAMES)
+#undef RARGS
+#define RARGS REDUCE_ARGS(double_Complex_t)
+ADAPT_REDUCTION(ReduceComplex8, double_Complex_t, CppComplexDouble, CMPLX,
+    RARGS, REDUCE_ARG_NAMES)
+#undef RARGS
+#if LDBL_MANT_DIG == 64
+#define RARGS REDUCE_ARGS(long_double_Complex_t)
+ADAPT_REDUCTION(ReduceComplex10, long_double_Complex_t, CppComplexLongDouble,
+    CMPLXL, RARGS, REDUCE_ARG_NAMES)
+#undef RARGS
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+#define RARGS REDUCE_ARGS(CFloat128ComplexType)
+ADAPT_REDUCTION(ReduceComplex16, CFloat128ComplexType, CppComplexFloat128,
+    CMPLXF128, RARGS, REDUCE_ARG_NAMES)
+#undef RARGS
+#endif

diff  --git a/flang/runtime/complex-reduction.h b/flang/runtime/complex-reduction.h
index 1d37b235d5194b..98b20d1e592be8 100644
--- a/flang/runtime/complex-reduction.h
+++ b/flang/runtime/complex-reduction.h
@@ -69,4 +69,49 @@ long_double_Complex_t RTNAME(DotProductComplex10)(DOT_PRODUCT_ARGS);
 CFloat128ComplexType RTNAME(DotProductComplex16)(DOT_PRODUCT_ARGS);
 #endif
 
+#define REDUCE_ARGS(T) \
+  T##_op operation, const struct CppDescriptor *x, \
+      const struct CppDescriptor *y, const char *source, int line, \
+      int dim /*=0*/, const struct CppDescriptor *mask /*=NULL*/, \
+      const T *identity /*=NULL*/, _Bool ordered /*=true*/
+#define REDUCE_ARG_NAMES \
+  operation, x, y, source, line, dim, mask, identity, ordered
+
+typedef float_Complex_t (*float_Complex_t_op)(
+    const float_Complex_t *, const float_Complex_t *);
+typedef double_Complex_t (*double_Complex_t_op)(
+    const double_Complex_t *, const double_Complex_t *);
+typedef long_double_Complex_t (*long_double_Complex_t_op)(
+    const long_double_Complex_t *, const long_double_Complex_t *);
+
+float_Complex_t RTNAME(ReduceComplex2)(REDUCE_ARGS(float_Complex_t));
+float_Complex_t RTNAME(ReduceComplex3)(REDUCE_ARGS(float_Complex_t));
+float_Complex_t RTNAME(ReduceComplex4)(REDUCE_ARGS(float_Complex_t));
+double_Complex_t RTNAME(ReduceComplex8)(REDUCE_ARGS(double_Complex_t));
+long_double_Complex_t RTNAME(ReduceComplex10)(
+    REDUCE_ARGS(long_double_Complex_t));
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+typedef CFloat128ComplexType (*CFloat128ComplexType_op)(
+    const CFloat128ComplexType *, const CFloat128ComplexType *);
+CFloat128ComplexType RTNAME(ReduceComplex16)(REDUCE_ARGS(CFloat128ComplexType));
+#endif
+
+#define REDUCE_DIM_ARGS(T) \
+  struct CppDescriptor *result, T##_op operation, \
+      const struct CppDescriptor *x, const struct CppDescriptor *y, \
+      const char *source, int line, int dim, \
+      const struct CppDescriptor *mask /*=NULL*/, const T *identity /*=NULL*/, \
+      _Bool ordered /*=true*/
+#define REDUCE_DIM_ARG_NAMES \
+  result, operation, x, y, source, line, dim, mask, identity, ordered
+
+void RTNAME(ReduceComplex2Dim)(REDUCE_DIM_ARGS(float_Complex_t));
+void RTNAME(ReduceComplex3Dim)(REDUCE_DIM_ARGS(float_Complex_t));
+void RTNAME(ReduceComplex4Dim)(REDUCE_DIM_ARGS(float_Complex_t));
+void RTNAME(ReduceComplex8Dim)(REDUCE_DIM_ARGS(double_Complex_t));
+void RTNAME(ReduceComplex10Dim)(REDUCE_DIM_ARGS(long_double_Complex_t));
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+void RTNAME(ReduceComplex16Dim)(REDUCE_DIM_ARGS(CFloat128ComplexType));
+#endif
+
 #endif // FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_

diff  --git a/flang/runtime/io-api.cpp b/flang/runtime/io-api.cpp
index 0f259f4715bf21..3a86c9fa7375e1 100644
--- a/flang/runtime/io-api.cpp
+++ b/flang/runtime/io-api.cpp
@@ -1147,7 +1147,7 @@ bool IONAME(OutputInteger8)(Cookie cookie, std::int8_t n) {
   if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputInteger8")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(
       TypeCategory::Integer, 1, reinterpret_cast<void *>(&n), 0);
@@ -1158,7 +1158,7 @@ bool IONAME(OutputInteger16)(Cookie cookie, std::int16_t n) {
   if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputInteger16")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(
       TypeCategory::Integer, 2, reinterpret_cast<void *>(&n), 0);
@@ -1170,7 +1170,7 @@ bool IODEF(OutputInteger32)(Cookie cookie, std::int32_t n) {
   if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputInteger32")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(
       TypeCategory::Integer, 4, reinterpret_cast<void *>(&n), 0);
@@ -1182,7 +1182,7 @@ bool IONAME(OutputInteger64)(Cookie cookie, std::int64_t n) {
   if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputInteger64")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(
       TypeCategory::Integer, 8, reinterpret_cast<void *>(&n), 0);
@@ -1194,7 +1194,7 @@ bool IONAME(OutputInteger128)(Cookie cookie, common::int128_t n) {
   if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputInteger128")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(
       TypeCategory::Integer, 16, reinterpret_cast<void *>(&n), 0);
@@ -1206,7 +1206,7 @@ bool IONAME(InputInteger)(Cookie cookie, std::int64_t &n, int kind) {
   if (!cookie->CheckFormattedStmtType<Direction::Input>("InputInteger")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(
       TypeCategory::Integer, kind, reinterpret_cast<void *>(&n), 0);
@@ -1217,7 +1217,7 @@ bool IONAME(OutputReal32)(Cookie cookie, float x) {
   if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputReal32")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(TypeCategory::Real, 4, reinterpret_cast<void *>(&x), 0);
   return descr::DescriptorIO<Direction::Output>(*cookie, descriptor);
@@ -1227,7 +1227,7 @@ bool IONAME(OutputReal64)(Cookie cookie, double x) {
   if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputReal64")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(TypeCategory::Real, 8, reinterpret_cast<void *>(&x), 0);
   return descr::DescriptorIO<Direction::Output>(*cookie, descriptor);
@@ -1237,7 +1237,7 @@ bool IONAME(InputReal32)(Cookie cookie, float &x) {
   if (!cookie->CheckFormattedStmtType<Direction::Input>("InputReal32")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(TypeCategory::Real, 4, reinterpret_cast<void *>(&x), 0);
   return descr::DescriptorIO<Direction::Input>(*cookie, descriptor);
@@ -1247,7 +1247,7 @@ bool IONAME(InputReal64)(Cookie cookie, double &x) {
   if (!cookie->CheckFormattedStmtType<Direction::Input>("InputReal64")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(TypeCategory::Real, 8, reinterpret_cast<void *>(&x), 0);
   return descr::DescriptorIO<Direction::Input>(*cookie, descriptor);
@@ -1258,7 +1258,7 @@ bool IONAME(OutputComplex32)(Cookie cookie, float r, float i) {
     return false;
   }
   float z[2]{r, i};
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(
       TypeCategory::Complex, 4, reinterpret_cast<void *>(&z), 0);
@@ -1270,7 +1270,7 @@ bool IONAME(OutputComplex64)(Cookie cookie, double r, double i) {
     return false;
   }
   double z[2]{r, i};
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(
       TypeCategory::Complex, 8, reinterpret_cast<void *>(&z), 0);
@@ -1281,7 +1281,7 @@ bool IONAME(InputComplex32)(Cookie cookie, float z[2]) {
   if (!cookie->CheckFormattedStmtType<Direction::Input>("InputComplex32")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(
       TypeCategory::Complex, 4, reinterpret_cast<void *>(z), 0);
@@ -1292,7 +1292,7 @@ bool IONAME(InputComplex64)(Cookie cookie, double z[2]) {
   if (!cookie->CheckFormattedStmtType<Direction::Input>("InputComplex64")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(
       TypeCategory::Complex, 8, reinterpret_cast<void *>(z), 0);
@@ -1304,7 +1304,7 @@ bool IONAME(OutputCharacter)(
   if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputCharacter")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(
       kind, length, reinterpret_cast<void *>(const_cast<char *>(x)), 0);
@@ -1320,7 +1320,7 @@ bool IONAME(InputCharacter)(
   if (!cookie->CheckFormattedStmtType<Direction::Input>("InputCharacter")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(kind, length, reinterpret_cast<void *>(x), 0);
   return descr::DescriptorIO<Direction::Input>(*cookie, descriptor);
@@ -1334,7 +1334,7 @@ bool IONAME(OutputLogical)(Cookie cookie, bool truth) {
   if (!cookie->CheckFormattedStmtType<Direction::Output>("OutputLogical")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(
       TypeCategory::Logical, sizeof truth, reinterpret_cast<void *>(&truth), 0);
@@ -1345,7 +1345,7 @@ bool IONAME(InputLogical)(Cookie cookie, bool &truth) {
   if (!cookie->CheckFormattedStmtType<Direction::Input>("InputLogical")) {
     return false;
   }
-  StaticDescriptor staticDescriptor;
+  StaticDescriptor<0> staticDescriptor;
   Descriptor &descriptor{staticDescriptor.descriptor()};
   descriptor.Establish(
       TypeCategory::Logical, sizeof truth, reinterpret_cast<void *>(&truth), 0);

diff  --git a/flang/runtime/reduce.cpp b/flang/runtime/reduce.cpp
new file mode 100644
index 00000000000000..f8a5221a1ebf76
--- /dev/null
+++ b/flang/runtime/reduce.cpp
@@ -0,0 +1,526 @@
+//===-- runtime/reduce.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REDUCE() implementation
+
+#include "flang/Runtime/reduce.h"
+#include "reduction-templates.h"
+#include "terminator.h"
+#include "tools.h"
+#include "flang/Runtime/descriptor.h"
+
+namespace Fortran::runtime {
+
+template <typename T> class ReduceAccumulator {
+public:
+  RT_API_ATTRS ReduceAccumulator(const Descriptor &array,
+      ReductionOperation<T> operation, const T *identity,
+      Terminator &terminator)
+      : array_{array}, operation_{operation}, identity_{identity},
+        terminator_{terminator} {}
+  RT_API_ATTRS void Reinitialize() { result_.reset(); }
+  template <typename A>
+  RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
+    const auto *operand{array_.Element<A>(at)};
+    if (result_) {
+      result_ = operation_(&*result_, operand);
+    } else {
+      result_ = *operand;
+    }
+    return true;
+  }
+  template <typename A>
+  RT_API_ATTRS void GetResult(A *to, int /*zeroBasedDim*/ = -1) {
+    if (result_) {
+      *to = *result_;
+    } else if (identity_) {
+      *to = *identity_;
+    } else {
+      terminator_.Crash("REDUCE() without IDENTITY= has no result");
+    }
+  }
+
+private:
+  const Descriptor &array_;
+  common::optional<T> result_;
+  ReductionOperation<T> operation_;
+  const T *identity_{nullptr};
+  Terminator &terminator_;
+};
+
+template <typename T, typename OP, bool hasLength>
+class BufferedReduceAccumulator {
+public:
+  RT_API_ATTRS BufferedReduceAccumulator(const Descriptor &array, OP operation,
+      const T *identity, Terminator &terminator)
+      : array_{array}, operation_{operation}, identity_{identity},
+        terminator_{terminator} {}
+  RT_API_ATTRS void Reinitialize() { activeTemp_ = -1; }
+  template <typename A>
+  RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
+    const auto *operand{array_.Element<A>(at)};
+    if (activeTemp_ >= 0) {
+      if constexpr (hasLength) {
+        operation_(&*temp_[1 - activeTemp_], length_, &*temp_[activeTemp_],
+            operand, length_, length_);
+      } else {
+        operation_(&*temp_[1 - activeTemp_], &*temp_[activeTemp_], operand);
+      }
+      activeTemp_ = 1 - activeTemp_;
+    } else {
+      activeTemp_ = 0;
+      std::memcpy(&*temp_[activeTemp_], operand, elementBytes_);
+    }
+    return true;
+  }
+  template <typename A>
+  RT_API_ATTRS void GetResult(A *to, int /*zeroBasedDim*/ = -1) {
+    if (activeTemp_ >= 0) {
+      std::memcpy(to, &*temp_[activeTemp_], elementBytes_);
+    } else if (identity_) {
+      std::memcpy(to, identity_, elementBytes_);
+    } else {
+      terminator_.Crash("REDUCE() without IDENTITY= has no result");
+    }
+  }
+
+private:
+  const Descriptor &array_;
+  OP operation_;
+  const T *identity_{nullptr};
+  Terminator &terminator_;
+  std::size_t elementBytes_{array_.ElementBytes()};
+  OwningPtr<T> temp_[2]{SizedNew<T>{terminator_}(elementBytes_),
+      SizedNew<T>{terminator_}(elementBytes_)};
+  int activeTemp_{-1};
+  std::size_t length_{elementBytes_ / sizeof(T)};
+};
+
+extern "C" {
+RT_EXT_API_GROUP_BEGIN
+
+std::int8_t RTDEF(ReduceInteger1)(const Descriptor &array,
+    ReductionOperation<std::int8_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int8_t *identity,
+    bool ordered) {
+  Terminator terminator{source, line};
+  return GetTotalReduction<TypeCategory::Integer, 1>(array, source, line, dim,
+      mask,
+      ReduceAccumulator<std::int8_t>{array, operation, identity, terminator},
+      "REDUCE");
+}
+void RTDEF(ReduceInteger1Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int8_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int8_t *identity,
+    bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = ReduceAccumulator<std::int8_t>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Integer, 1>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+std::int16_t RTDEF(ReduceInteger2)(const Descriptor &array,
+    ReductionOperation<std::int16_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int16_t *identity,
+    bool ordered) {
+  Terminator terminator{source, line};
+  return GetTotalReduction<TypeCategory::Integer, 2>(array, source, line, dim,
+      mask,
+      ReduceAccumulator<std::int16_t>{array, operation, identity, terminator},
+      "REDUCE");
+}
+void RTDEF(ReduceInteger2Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int16_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int16_t *identity,
+    bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = ReduceAccumulator<std::int16_t>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Integer, 2>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+std::int32_t RTDEF(ReduceInteger4)(const Descriptor &array,
+    ReductionOperation<std::int32_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int32_t *identity,
+    bool ordered) {
+  Terminator terminator{source, line};
+  return GetTotalReduction<TypeCategory::Integer, 4>(array, source, line, dim,
+      mask,
+      ReduceAccumulator<std::int32_t>{array, operation, identity, terminator},
+      "REDUCE");
+}
+void RTDEF(ReduceInteger4Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int32_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int32_t *identity,
+    bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = ReduceAccumulator<std::int32_t>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Integer, 4>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+std::int64_t RTDEF(ReduceInteger8)(const Descriptor &array,
+    ReductionOperation<std::int64_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int64_t *identity,
+    bool ordered) {
+  Terminator terminator{source, line};
+  return GetTotalReduction<TypeCategory::Integer, 8>(array, source, line, dim,
+      mask,
+      ReduceAccumulator<std::int64_t>{array, operation, identity, terminator},
+      "REDUCE");
+}
+void RTDEF(ReduceInteger8Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int64_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int64_t *identity,
+    bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = ReduceAccumulator<std::int64_t>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Integer, 8>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+#ifdef __SIZEOF_INT128__
+common::int128_t RTDEF(ReduceInteger16)(const Descriptor &array,
+    ReductionOperation<common::int128_t> operation, const char *source,
+    int line, int dim, const Descriptor *mask, const common::int128_t *identity,
+    bool ordered) {
+  Terminator terminator{source, line};
+  return GetTotalReduction<TypeCategory::Integer, 16>(array, source, line, dim,
+      mask,
+      ReduceAccumulator<common::int128_t>{
+          array, operation, identity, terminator},
+      "REDUCE");
+}
+void RTDEF(ReduceInteger16Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<common::int128_t> operation, const char *source,
+    int line, int dim, const Descriptor *mask, const common::int128_t *identity,
+    bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = ReduceAccumulator<common::int128_t>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Integer, 16>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+#endif
+
+// TODO: real/complex(2 & 3)
+float RTDEF(ReduceReal4)(const Descriptor &array,
+    ReductionOperation<float> operation, const char *source, int line, int dim,
+    const Descriptor *mask, const float *identity, bool ordered) {
+  Terminator terminator{source, line};
+  return GetTotalReduction<TypeCategory::Real, 4>(array, source, line, dim,
+      mask, ReduceAccumulator<float>{array, operation, identity, terminator},
+      "REDUCE");
+}
+void RTDEF(ReduceReal4Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<float> operation, const char *source, int line, int dim,
+    const Descriptor *mask, const float *identity, bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = ReduceAccumulator<float>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Real, 4>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+double RTDEF(ReduceReal8)(const Descriptor &array,
+    ReductionOperation<double> operation, const char *source, int line, int dim,
+    const Descriptor *mask, const double *identity, bool ordered) {
+  Terminator terminator{source, line};
+  return GetTotalReduction<TypeCategory::Real, 8>(array, source, line, dim,
+      mask, ReduceAccumulator<double>{array, operation, identity, terminator},
+      "REDUCE");
+}
+void RTDEF(ReduceReal8Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<double> operation, const char *source, int line, int dim,
+    const Descriptor *mask, const double *identity, bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = ReduceAccumulator<double>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Real, 8>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+#if LDBL_MANT_DIG == 64
+long double RTDEF(ReduceReal10)(const Descriptor &array,
+    ReductionOperation<long double> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const long double *identity,
+    bool ordered) {
+  Terminator terminator{source, line};
+  return GetTotalReduction<TypeCategory::Real, 10>(array, source, line, dim,
+      mask,
+      ReduceAccumulator<long double>{array, operation, identity, terminator},
+      "REDUCE");
+}
+void RTDEF(ReduceReal10Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<long double> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const long double *identity,
+    bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = ReduceAccumulator<long double>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Real, 10>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppFloat128Type RTDEF(ReduceReal16)(const Descriptor &array,
+    ReductionOperation<CppFloat128Type> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const CppFloat128Type *identity,
+    bool ordered) {
+  Terminator terminator{source, line};
+  return GetTotalReduction<TypeCategory::Real, 16>(array, source, line, dim,
+      mask,
+      ReduceAccumulator<CppFloat128Type>{
+          array, operation, identity, terminator},
+      "REDUCE");
+}
+void RTDEF(ReduceReal16Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<CppFloat128Type> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const CppFloat128Type *identity,
+    bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = ReduceAccumulator<CppFloat128Type>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Real, 16>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+#endif
+
+void RTDEF(CppReduceComplex4)(std::complex<float> &result,
+    const Descriptor &array, ReductionOperation<std::complex<float>> operation,
+    const char *source, int line, int dim, const Descriptor *mask,
+    const std::complex<float> *identity, bool ordered) {
+  Terminator terminator{source, line};
+  result = GetTotalReduction<TypeCategory::Complex, 4>(array, source, line, dim,
+      mask,
+      ReduceAccumulator<std::complex<float>>{
+          array, operation, identity, terminator},
+      "REDUCE");
+}
+void RTDEF(CppReduceComplex4Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::complex<float>> operation, const char *source,
+    int line, int dim, const Descriptor *mask,
+    const std::complex<float> *identity, bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = ReduceAccumulator<std::complex<float>>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Complex, 4>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+void RTDEF(CppReduceComplex8)(std::complex<double> &result,
+    const Descriptor &array, ReductionOperation<std::complex<double>> operation,
+    const char *source, int line, int dim, const Descriptor *mask,
+    const std::complex<double> *identity, bool ordered) {
+  Terminator terminator{source, line};
+  result = GetTotalReduction<TypeCategory::Complex, 8>(array, source, line, dim,
+      mask,
+      ReduceAccumulator<std::complex<double>>{
+          array, operation, identity, terminator},
+      "REDUCE");
+}
+void RTDEF(CppReduceComplex8Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::complex<double>> operation, const char *source,
+    int line, int dim, const Descriptor *mask,
+    const std::complex<double> *identity, bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = ReduceAccumulator<std::complex<double>>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Complex, 8>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+#if LDBL_MANT_DIG == 64
+void RTDEF(CppReduceComplex10)(std::complex<long double> &result,
+    const Descriptor &array,
+    ReductionOperation<std::complex<long double>> operation, const char *source,
+    int line, int dim, const Descriptor *mask,
+    const std::complex<long double> *identity, bool ordered) {
+  Terminator terminator{source, line};
+  result = GetTotalReduction<TypeCategory::Complex, 10>(array, source, line,
+      dim, mask,
+      ReduceAccumulator<std::complex<long double>>{
+          array, operation, identity, terminator},
+      "REDUCE");
+}
+void RTDEF(CppReduceComplex10Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::complex<long double>> operation, const char *source,
+    int line, int dim, const Descriptor *mask,
+    const std::complex<long double> *identity, bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = ReduceAccumulator<std::complex<long double>>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Complex, 10>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+void RTDEF(CppReduceComplex16)(std::complex<CppFloat128Type> &result,
+    const Descriptor &array,
+    ReductionOperation<std::complex<CppFloat128Type>> operation,
+    const char *source, int line, int dim, const Descriptor *mask,
+    const std::complex<CppFloat128Type> *identity, bool ordered) {
+  Terminator terminator{source, line};
+  result = GetTotalReduction<TypeCategory::Complex, 16>(array, source, line,
+      dim, mask,
+      ReduceAccumulator<std::complex<CppFloat128Type>>{
+          array, operation, identity, terminator},
+      "REDUCE");
+}
+void RTDEF(CppReduceComplex16Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::complex<CppFloat128Type>> operation,
+    const char *source, int line, int dim, const Descriptor *mask,
+    const std::complex<CppFloat128Type> *identity, bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = ReduceAccumulator<std::complex<CppFloat128Type>>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Complex, 16>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+#endif
+
+bool RTDEF(ReduceLogical1)(const Descriptor &array,
+    ReductionOperation<std::int8_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int8_t *identity,
+    bool ordered) {
+  return RTNAME(ReduceInteger1)(
+             array, operation, source, line, dim, mask, identity, ordered) != 0;
+}
+void RTDEF(ReduceLogical1Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int8_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int8_t *identity,
+    bool ordered) {
+  RTNAME(ReduceInteger1Dim)
+  (result, array, operation, source, line, dim, mask, identity, ordered);
+}
+bool RTDEF(ReduceLogical2)(const Descriptor &array,
+    ReductionOperation<std::int16_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int16_t *identity,
+    bool ordered) {
+  return RTNAME(ReduceInteger2)(
+             array, operation, source, line, dim, mask, identity, ordered) != 0;
+}
+void RTDEF(ReduceLogical2Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int16_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int16_t *identity,
+    bool ordered) {
+  RTNAME(ReduceInteger2Dim)
+  (result, array, operation, source, line, dim, mask, identity, ordered);
+}
+bool RTDEF(ReduceLogical4)(const Descriptor &array,
+    ReductionOperation<std::int32_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int32_t *identity,
+    bool ordered) {
+  return RTNAME(ReduceInteger4)(
+             array, operation, source, line, dim, mask, identity, ordered) != 0;
+}
+void RTDEF(ReduceLogical4Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int32_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int32_t *identity,
+    bool ordered) {
+  RTNAME(ReduceInteger4Dim)
+  (result, array, operation, source, line, dim, mask, identity, ordered);
+}
+bool RTDEF(ReduceLogical8)(const Descriptor &array,
+    ReductionOperation<std::int64_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int64_t *identity,
+    bool ordered) {
+  return RTNAME(ReduceInteger8)(
+             array, operation, source, line, dim, mask, identity, ordered) != 0;
+}
+void RTDEF(ReduceLogical8Dim)(Descriptor &result, const Descriptor &array,
+    ReductionOperation<std::int64_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const std::int64_t *identity,
+    bool ordered) {
+  RTNAME(ReduceInteger8Dim)
+  (result, array, operation, source, line, dim, mask, identity, ordered);
+}
+
+void RTDEF(ReduceChar1)(char *result, const Descriptor &array,
+    ReductionCharOperation<char> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const char *identity, bool ordered) {
+  Terminator terminator{source, line};
+  BufferedReduceAccumulator<char, ReductionCharOperation<char>,
+      /*hasLength=*/true>
+      accumulator{array, operation, identity, terminator};
+  DoTotalReduction<char>(array, dim, mask, accumulator, "REDUCE", terminator);
+  accumulator.GetResult(result);
+}
+void RTDEF(ReduceCharacter1Dim)(Descriptor &result, const Descriptor &array,
+    ReductionCharOperation<char> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const char *identity, bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = BufferedReduceAccumulator<char,
+      ReductionCharOperation<char>, /*hasLength=*/true>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Character, 1>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+void RTDEF(ReduceChar2)(char16_t *result, const Descriptor &array,
+    ReductionCharOperation<char16_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const char16_t *identity, bool ordered) {
+  Terminator terminator{source, line};
+  BufferedReduceAccumulator<char16_t, ReductionCharOperation<char16_t>,
+      /*hasLength=*/true>
+      accumulator{array, operation, identity, terminator};
+  DoTotalReduction<char16_t>(
+      array, dim, mask, accumulator, "REDUCE", terminator);
+  accumulator.GetResult(result);
+}
+void RTDEF(ReduceCharacter2Dim)(Descriptor &result, const Descriptor &array,
+    ReductionCharOperation<char16_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const char16_t *identity, bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = BufferedReduceAccumulator<char16_t,
+      ReductionCharOperation<char16_t>, /*hasLength=*/true>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Character, 2>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+void RTDEF(ReduceChar4)(char32_t *result, const Descriptor &array,
+    ReductionCharOperation<char32_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const char32_t *identity, bool ordered) {
+  Terminator terminator{source, line};
+  BufferedReduceAccumulator<char32_t, ReductionCharOperation<char32_t>,
+      /*hasLength=*/true>
+      accumulator{array, operation, identity, terminator};
+  DoTotalReduction<char32_t>(
+      array, dim, mask, accumulator, "REDUCE", terminator);
+  accumulator.GetResult(result);
+}
+void RTDEF(ReduceCharacter4Dim)(Descriptor &result, const Descriptor &array,
+    ReductionCharOperation<char32_t> operation, const char *source, int line,
+    int dim, const Descriptor *mask, const char32_t *identity, bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = BufferedReduceAccumulator<char32_t,
+      ReductionCharOperation<char32_t>, /*hasLength=*/true>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Character, 4>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+
+void RTDEF(ReduceDerivedType)(char *result, const Descriptor &array,
+    ReductionDerivedTypeOperation operation, const char *source, int line,
+    int dim, const Descriptor *mask, const char *identity, bool ordered) {
+  Terminator terminator{source, line};
+  BufferedReduceAccumulator<char, ReductionDerivedTypeOperation,
+      /*hasLength=*/false>
+      accumulator{array, operation, identity, terminator};
+  DoTotalReduction<char>(array, dim, mask, accumulator, "REDUCE", terminator);
+  accumulator.GetResult(result);
+}
+void RTDEF(ReduceDerivedTypeDim)(Descriptor &result, const Descriptor &array,
+    ReductionDerivedTypeOperation operation, const char *source, int line,
+    int dim, const Descriptor *mask, const char *identity, bool ordered) {
+  Terminator terminator{source, line};
+  using Accumulator = BufferedReduceAccumulator<char,
+      ReductionDerivedTypeOperation, /*hasLength=*/false>;
+  Accumulator accumulator{array, operation, identity, terminator};
+  PartialReduction<Accumulator, TypeCategory::Derived, 0>(result, array,
+      array.ElementBytes(), dim, mask, terminator, "REDUCE", accumulator);
+}
+
+RT_EXT_API_GROUP_END
+} // extern "C"
+} // namespace Fortran::runtime

diff  --git a/flang/runtime/reduction-templates.h b/flang/runtime/reduction-templates.h
index 5b793deb2a123d..f8e6f6095509e6 100644
--- a/flang/runtime/reduction-templates.h
+++ b/flang/runtime/reduction-templates.h
@@ -53,9 +53,9 @@ inline RT_API_ATTRS void DoTotalReduction(const Descriptor &x, int dim,
   x.GetLowerBounds(xAt);
   if (mask) {
     CheckConformability(x, *mask, terminator, intrinsic, "ARRAY", "MASK");
-    SubscriptValue maskAt[maxRank];
-    mask->GetLowerBounds(maskAt);
     if (mask->rank() > 0) {
+      SubscriptValue maskAt[maxRank];
+      mask->GetLowerBounds(maskAt);
       for (auto elements{x.Elements()}; elements--;
            x.IncrementSubscripts(xAt), mask->IncrementSubscripts(maskAt)) {
         if (IsLogicalElementTrue(*mask, maskAt)) {
@@ -65,7 +65,7 @@ inline RT_API_ATTRS void DoTotalReduction(const Descriptor &x, int dim,
         }
       }
       return;
-    } else if (!IsLogicalElementTrue(*mask, maskAt)) {
+    } else if (!IsLogicalScalarTrue(*mask)) {
       // scalar MASK=.FALSE.: return identity value
       return;
     }
@@ -86,13 +86,22 @@ inline RT_API_ATTRS CppTypeFor<CAT, KIND> GetTotalReduction(const Descriptor &x,
   RUNTIME_CHECK(terminator, TypeCode(CAT, KIND) == x.type());
   using CppType = CppTypeFor<CAT, KIND>;
   DoTotalReduction<CppType>(x, dim, mask, accumulator, intrinsic, terminator);
-  CppType result;
+  if constexpr (std::is_void_v<CppType>) {
+    // Result is returned from accumulator, as in REDUCE() for derived type
 #ifdef _MSC_VER // work around MSVC spurious error
-  accumulator.GetResult(&result);
+    accumulator.GetResult();
 #else
-  accumulator.template GetResult(&result);
+    accumulator.template GetResult();
 #endif
-  return result;
+  } else {
+    CppType result;
+#ifdef _MSC_VER // work around MSVC spurious error
+    accumulator.GetResult(&result);
+#else
+    accumulator.template GetResult(&result);
+#endif
+    return result;
+  }
 }
 
 // For reductions on a dimension, e.g. SUM(array,DIM=2) where the shape
@@ -164,35 +173,6 @@ inline RT_API_ATTRS void ReduceDimMaskToScalar(const Descriptor &x,
 #endif
 }
 
-// Utility: establishes & allocates the result array for a partial
-// reduction (i.e., one with DIM=).
-static RT_API_ATTRS void CreatePartialReductionResult(Descriptor &result,
-    const Descriptor &x, std::size_t resultElementSize, int dim,
-    Terminator &terminator, const char *intrinsic, TypeCode typeCode) {
-  int xRank{x.rank()};
-  if (dim < 1 || dim > xRank) {
-    terminator.Crash(
-        "%s: bad DIM=%d for ARRAY with rank %d", intrinsic, dim, xRank);
-  }
-  int zeroBasedDim{dim - 1};
-  SubscriptValue resultExtent[maxRank];
-  for (int j{0}; j < zeroBasedDim; ++j) {
-    resultExtent[j] = x.GetDimension(j).Extent();
-  }
-  for (int j{zeroBasedDim + 1}; j < xRank; ++j) {
-    resultExtent[j - 1] = x.GetDimension(j).Extent();
-  }
-  result.Establish(typeCode, resultElementSize, nullptr, xRank - 1,
-      resultExtent, CFI_attribute_allocatable);
-  for (int j{0}; j + 1 < xRank; ++j) {
-    result.GetDimension(j).SetBounds(1, resultExtent[j]);
-  }
-  if (int stat{result.Allocate()}) {
-    terminator.Crash(
-        "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
-  }
-}
-
 // Partial reductions with DIM=
 
 template <typename ACCUMULATOR, TypeCategory CAT, int KIND>
@@ -208,7 +188,6 @@ inline RT_API_ATTRS void PartialReduction(Descriptor &result,
   using CppType = CppTypeFor<CAT, KIND>;
   if (mask) {
     CheckConformability(x, *mask, terminator, intrinsic, "ARRAY", "MASK");
-    SubscriptValue maskAt[maxRank]; // contents unused
     if (mask->rank() > 0) {
       for (auto n{result.Elements()}; n-- > 0; result.IncrementSubscripts(at)) {
         accumulator.Reinitialize();
@@ -216,7 +195,7 @@ inline RT_API_ATTRS void PartialReduction(Descriptor &result,
             x, dim - 1, at, *mask, result.Element<CppType>(at), accumulator);
       }
       return;
-    } else if (!IsLogicalElementTrue(*mask, maskAt)) {
+    } else if (!IsLogicalScalarTrue(*mask)) {
       // scalar MASK=.FALSE.
       accumulator.Reinitialize();
       for (auto n{result.Elements()}; n-- > 0; result.IncrementSubscripts(at)) {

diff  --git a/flang/runtime/tools.cpp b/flang/runtime/tools.cpp
index 71022c7a8c179d..73d6c2cf7e1d2b 100644
--- a/flang/runtime/tools.cpp
+++ b/flang/runtime/tools.cpp
@@ -238,5 +238,34 @@ template <int KIND> struct FitsInIntegerKind {
   }
 };
 
+// Utility: establishes & allocates the result array for a partial
+// reduction (i.e., one with DIM=).
+RT_API_ATTRS void CreatePartialReductionResult(Descriptor &result,
+    const Descriptor &x, std::size_t resultElementSize, int dim,
+    Terminator &terminator, const char *intrinsic, TypeCode typeCode) {
+  int xRank{x.rank()};
+  if (dim < 1 || dim > xRank) {
+    terminator.Crash(
+        "%s: bad DIM=%d for ARRAY with rank %d", intrinsic, dim, xRank);
+  }
+  int zeroBasedDim{dim - 1};
+  SubscriptValue resultExtent[maxRank];
+  for (int j{0}; j < zeroBasedDim; ++j) {
+    resultExtent[j] = x.GetDimension(j).Extent();
+  }
+  for (int j{zeroBasedDim + 1}; j < xRank; ++j) {
+    resultExtent[j - 1] = x.GetDimension(j).Extent();
+  }
+  result.Establish(typeCode, resultElementSize, nullptr, xRank - 1,
+      resultExtent, CFI_attribute_allocatable);
+  for (int j{0}; j + 1 < xRank; ++j) {
+    result.GetDimension(j).SetBounds(1, resultExtent[j]);
+  }
+  if (int stat{result.Allocate()}) {
+    terminator.Crash(
+        "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
+  }
+}
+
 RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime

diff  --git a/flang/runtime/tools.h b/flang/runtime/tools.h
index c70a1b438e3329..5d7d99c08179dd 100644
--- a/flang/runtime/tools.h
+++ b/flang/runtime/tools.h
@@ -62,7 +62,7 @@ RT_API_ATTRS int IdentifyValue(
 RT_API_ATTRS void ToFortranDefaultCharacter(
     char *to, std::size_t toLength, const char *from);
 
-// Utility for dealing with elemental LOGICAL arguments
+// Utilities for dealing with elemental LOGICAL arguments
 inline RT_API_ATTRS bool IsLogicalElementTrue(
     const Descriptor &logical, const SubscriptValue at[]) {
   // A LOGICAL value is false if and only if all of its bytes are zero.
@@ -74,6 +74,16 @@ inline RT_API_ATTRS bool IsLogicalElementTrue(
   }
   return false;
 }
+inline RT_API_ATTRS bool IsLogicalScalarTrue(const Descriptor &logical) {
+  // A LOGICAL value is false if and only if all of its bytes are zero.
+  const char *p{logical.OffsetElement<char>()};
+  for (std::size_t j{logical.ElementBytes()}; j-- > 0; ++p) {
+    if (*p) {
+      return true;
+    }
+  }
+  return false;
+}
 
 // Check array conformability; a scalar 'x' conforms.  Crashes on error.
 RT_API_ATTRS void CheckConformability(const Descriptor &to, const Descriptor &x,
@@ -511,5 +521,9 @@ RT_API_ATTRS void CopyAndPad(
   }
 }
 
+RT_API_ATTRS void CreatePartialReductionResult(Descriptor &result,
+    const Descriptor &x, std::size_t resultElementSize, int dim, Terminator &,
+    const char *intrinsic, TypeCode);
+
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_TOOLS_H_

diff  --git a/flang/unittests/Runtime/Reduction.cpp b/flang/unittests/Runtime/Reduction.cpp
index b17988bc17699d..b2661e78abdf58 100644
--- a/flang/unittests/Runtime/Reduction.cpp
+++ b/flang/unittests/Runtime/Reduction.cpp
@@ -13,6 +13,7 @@
 #include "flang/Runtime/allocatable.h"
 #include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/descriptor.h"
+#include "flang/Runtime/reduce.h"
 #include "flang/Runtime/type-code.h"
 #include <cstdint>
 #include <cstring>
@@ -634,3 +635,39 @@ TEST(Reductions, ExtremaReal16) {
   EXPECT_EQ(RTNAME(MaxvalReal16)(*maxArray, __FILE__, __LINE__), -1.0);
 }
 #endif // LDBL_MANT_DIG == 113 || HAS_FLOAT128
+
+static std::int32_t IAdd(const std::int32_t *x, const std::int32_t *y) {
+  return *x + *y;
+}
+
+static std::int32_t IMultiply(const std::int32_t *x, const std::int32_t *y) {
+  return *x * *y;
+}
+
+TEST(Reductions, ReduceInt4) {
+  auto intVector{MakeArray<TypeCategory::Integer, 4>(
+      std::vector<int>{4}, std::vector<std::int32_t>{1, 2, 3, 4})};
+  EXPECT_EQ(RTNAME(ReduceInteger4)(*intVector, IAdd, __FILE__, __LINE__), 10);
+  EXPECT_EQ(
+      RTNAME(ReduceInteger4)(*intVector, IMultiply, __FILE__, __LINE__), 24);
+}
+TEST(Reductions, ReduceInt4Dim) {
+  auto intMatrix{MakeArray<TypeCategory::Integer, 4>(
+      std::vector<int>{2, 2}, std::vector<std::int32_t>{1, 2, 3, 4})};
+  StaticDescriptor<1, true> statDesc;
+  Descriptor &sums{statDesc.descriptor()};
+  RTNAME(ReduceInteger4Dim)(sums, *intMatrix, IAdd, __FILE__, __LINE__, 1);
+  EXPECT_EQ(sums.rank(), 1);
+  EXPECT_EQ(sums.GetDimension(0).LowerBound(), 1);
+  EXPECT_EQ(sums.GetDimension(0).Extent(), 2);
+  EXPECT_EQ(*sums.ZeroBasedIndexedElement<std::int32_t>(0), 3);
+  EXPECT_EQ(*sums.ZeroBasedIndexedElement<std::int32_t>(1), 7);
+  sums.Destroy();
+  RTNAME(ReduceInteger4Dim)(sums, *intMatrix, IAdd, __FILE__, __LINE__, 2);
+  EXPECT_EQ(sums.rank(), 1);
+  EXPECT_EQ(sums.GetDimension(0).LowerBound(), 1);
+  EXPECT_EQ(sums.GetDimension(0).Extent(), 2);
+  EXPECT_EQ(*sums.ZeroBasedIndexedElement<std::int32_t>(0), 4);
+  EXPECT_EQ(*sums.ZeroBasedIndexedElement<std::int32_t>(1), 6);
+  sums.Destroy();
+}


        


More information about the flang-commits mailing list