[flang-commits] [flang] 66abe64 - [flang][hlfir] add an optimized bufferization pass

Fri Aug 18 02:53:51 PDT 2023

Author: Tom Eccles
Date: 2023-08-18T09:51:22Z
New Revision: 66abe644663e84305534b44e035f6fbd383ae81f

URL: https://github.com/llvm/llvm-project/commit/66abe644663e84305534b44e035f6fbd383ae81f
DIFF: https://github.com/llvm/llvm-project/commit/66abe644663e84305534b44e035f6fbd383ae81f.diff

LOG: [flang][hlfir] add an optimized bufferization pass

This pass is intended to spot cases where we can do better than the
default bufferization and to rewrite those specific cases. Then the
default bufferization (bufferize-hlfir pass) can handle everything else.

The transformation added in this patch rewrites simple element-wise
updates to an array to a do-loop modifying the array in place instead of
creating and assigning an array temporary.

See the RFC at
https://discourse.llvm.org/t/rfc-hlfir-optimized-bufferization-for-elemental-array-updates

This patch gets the improvement to exchange2 but not the improvement to cam4
described in the RFC. I think the cam4 improvement will require better alias
analysis. I aim to follow up to fix this in a later patch. With changes
since the RFC, the pass improves polyhedron channel2 by about 52%.

Depends on: D156805 D157718 D157626

Differential Revision: https://reviews.llvm.org/D157107

Added: 
    flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
    flang/test/HLFIR/opt-bufferization.fir

Modified: 
    flang/include/flang/Optimizer/HLFIR/Passes.h
    flang/include/flang/Optimizer/HLFIR/Passes.td
    flang/include/flang/Tools/CLOptions.inc
    flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
    flang/test/Fir/basic-program.fir

Removed: 
    


################################################################################
diff  --git a/flang/include/flang/Optimizer/HLFIR/Passes.h b/flang/include/flang/Optimizer/HLFIR/Passes.h
index eb3cc14942e88c..3314e0b887f6eb 100644

--- a/flang/include/flang/Optimizer/HLFIR/Passes.h
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.h
@@ -28,6 +28,7 @@ std::unique_ptr<mlir::Pass> createLowerHLFIRIntrinsicsPass();
 std::unique_ptr<mlir::Pass> createSimplifyHLFIRIntrinsicsPass();
 std::unique_ptr<mlir::Pass> createInlineElementalsPass();
 std::unique_ptr<mlir::Pass> createLowerHLFIROrderedAssignmentsPass();
+std::unique_ptr<mlir::Pass> createOptimizedBufferizationPass();
 
 #define GEN_PASS_REGISTRATION
 #include "flang/Optimizer/HLFIR/Passes.h.inc"

diff  --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td
index 7e832a918f8e44..c6e503c3a2760e 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.td
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.td
@@ -20,6 +20,11 @@ def BufferizeHLFIR : Pass<"bufferize-hlfir", "::mlir::ModuleOp"> {
   let constructor = "hlfir::createBufferizeHLFIRPass()";
 }
 
+def OptimizedBufferization : Pass<"opt-bufferization", "::mlir::func::FuncOp"> {
+  let summary = "Special cases for hlfir.expr bufferization where we can avoid a temporary which would be created by the generic bufferization pass";
+  let constructor = "hlfir::createOptimizedBufferizationPass()";
+}
+
 def LowerHLFIRIntrinsics : Pass<"lower-hlfir-intrinsics", "::mlir::ModuleOp"> {
   let summary = "Lower HLFIR transformational intrinsic operations";
   let constructor = "hlfir::createLowerHLFIRIntrinsicsPass()";

diff  --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index 16eb998d7ce1df..e2a071d7516979 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -243,6 +243,11 @@ inline void createHLFIRToFIRPassPipeline(
     pm.addPass(hlfir::createSimplifyHLFIRIntrinsicsPass());
   }
   pm.addPass(hlfir::createInlineElementalsPass());
+  if (optLevel.isOptimizingForSpeed()) {
+    addCanonicalizerPassWithoutRegionSimplification(pm);
+    pm.addPass(mlir::createCSEPass());
+    pm.addPass(hlfir::createOptimizedBufferizationPass());
+  }
   pm.addPass(hlfir::createLowerHLFIROrderedAssignmentsPass());
   pm.addPass(hlfir::createLowerHLFIRIntrinsicsPass());
   pm.addPass(hlfir::createBufferizeHLFIRPass());

diff  --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
index bde1d47603d8f4..603b328a6823fe 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
@@ -8,6 +8,7 @@ add_flang_library(HLFIRTransforms
   LowerHLFIROrderedAssignments.cpp
   ScheduleOrderedAssignments.cpp
   SimplifyHLFIRIntrinsics.cpp
+  OptimizedBufferization.cpp
 
   DEPENDS
   FIRDialect

diff  --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
new file mode 100644
index 00000000000000..ee97ccfb7707f4
--- /dev/null
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -0,0 +1,388 @@
+//===- OptimizedBufferization.cpp - special cases for bufferization -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// In some special cases we can bufferize hlfir expressions in a more optimal
+// way so as to avoid creating temporaries. This pass handles these. It should
+// be run before the catch-all bufferization pass.
+//
+// This requires common subexpression elimination to have already been run.
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Analysis/AliasAnalysis.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/HLFIRTools.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/HLFIR/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include <iterator>
+#include <memory>
+#include <mlir/Analysis/AliasAnalysis.h>
+#include <optional>
+
+namespace hlfir {
+#define GEN_PASS_DEF_OPTIMIZEDBUFFERIZATION
+#include "flang/Optimizer/HLFIR/Passes.h.inc"
+} // namespace hlfir
+
+#define DEBUG_TYPE "opt-bufferization"
+
+namespace {
+
+/// This transformation should match in place modification of arrays.
+/// It should match code of the form
+/// %array = some.operation // array has shape %shape
+/// %expr = hlfir.elemental %shape : [...] {
+/// bb0(%arg0: index)
+///   %0 = hlfir.designate %array(%arg0)
+///   [...] // no other reads or writes to %array
+///   hlfir.yield_element %element
+/// }
+/// hlfir.assign %expr to %array
+/// hlfir.destroy %expr
+///
+/// Or
+///
+/// %read_array = some.operation // shape %shape
+/// %expr = hlfir.elemental %shape : [...] {
+/// bb0(%arg0: index)
+///   %0 = hlfir.designate %read_array(%arg0)
+///   [...]
+///   hlfir.yield_element %element
+/// }
+/// %write_array = some.operation // with shape %shape
+/// [...] // operations which don't affect write_array
+/// hlfir.assign %expr to %write_array
+/// hlfir.destroy %expr
+///
+/// In these cases, it is safe to turn the elemental into a do loop and modify
+/// elements of %array in place without creating an extra temporary for the
+/// elemental. We must check that there are no reads from the array at indexes
+/// which might conflict with the assignment or any writes. For now we will keep
+/// that strict and say that all reads must be at the elemental index (it is
+/// probably safe to read from higher indices if lowering to an ordered loop).
+class ElementalAssignBufferization
+    : public mlir::OpRewritePattern<hlfir::ElementalOp> {
+private:
+  struct MatchInfo {
+    mlir::Value array;
+    hlfir::AssignOp assign;
+    hlfir::DestroyOp destroy;
+  };
+  /// determines if the transformation can be applied to this elemental
+  static std::optional<MatchInfo> findMatch(hlfir::ElementalOp elemental);
+
+public:
+  using mlir::OpRewritePattern<hlfir::ElementalOp>::OpRewritePattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(hlfir::ElementalOp elemental,
+                  mlir::PatternRewriter &rewriter) const override;
+};
+
+/// recursively collect all effects between start and end (including start, not
+/// including end). start must properly dominate end, and start and end must be
+/// in the same block. If any operations with unknown effects are found,
+/// std::nullopt is returned
+static std::optional<mlir::SmallVector<mlir::MemoryEffects::EffectInstance>>
+getEffectsBetween(mlir::Operation *start, mlir::Operation *end) {
+  mlir::SmallVector<mlir::MemoryEffects::EffectInstance> ret;
+  if (start == end)
+    return ret;
+  assert(start->getBlock() && end->getBlock() && "TODO: block arguments");
+  assert(start->getBlock() == end->getBlock());
+  assert(mlir::DominanceInfo{}.properlyDominates(start, end));
+
+  mlir::Operation *nextOp = start;
+  while (nextOp && nextOp != end) {
+    std::optional<mlir::SmallVector<mlir::MemoryEffects::EffectInstance>>
+        effects = mlir::getEffectsRecursively(nextOp);
+    if (!effects)
+      return std::nullopt;
+    ret.append(*effects);
+    nextOp = nextOp->getNextNode();
+  }
+  return ret;
+}
+
+/// If effect is a read or write on val, return whether it aliases.
+/// Otherwise return mlir::AliasResult::NoAlias
+static mlir::AliasResult
+containsReadOrWriteEffectOn(const mlir::MemoryEffects::EffectInstance &effect,
+                            mlir::Value val) {
+  fir::AliasAnalysis aliasAnalysis;
+
+  if (mlir::isa<mlir::MemoryEffects::Read, mlir::MemoryEffects::Write>(
+          effect.getEffect())) {
+    mlir::Value accessedVal = effect.getValue();
+    if (mlir::isa<fir::DebuggingResource>(effect.getResource()))
+      return mlir::AliasResult::NoAlias;
+    if (!accessedVal)
+      return mlir::AliasResult::MayAlias;
+    if (accessedVal == val)
+      return mlir::AliasResult::MustAlias;
+
+    // if the accessed value might alias val
+    mlir::AliasResult res = aliasAnalysis.alias(val, accessedVal);
+    if (!res.isNo())
+      return res;
+
+    // FIXME: alias analysis of fir.load
+    // follow this common pattern:
+    // %ref = hlfir.designate %array(%index)
+    // %val = fir.load %ref
+    if (auto designate = accessedVal.getDefiningOp<hlfir::DesignateOp>()) {
+      if (designate.getMemref() == val)
+        return mlir::AliasResult::MustAlias;
+
+      // if the designate is into an array that might alias val
+      res = aliasAnalysis.alias(val, designate.getMemref());
+      if (!res.isNo())
+        return res;
+    }
+  }
+  return mlir::AliasResult::NoAlias;
+}
+
+std::optional<ElementalAssignBufferization::MatchInfo>
+ElementalAssignBufferization::findMatch(hlfir::ElementalOp elemental) {
+  mlir::Operation::user_range users = elemental->getUsers();
+  // the only uses of the elemental should be the assignment and the destroy
+  if (std::distance(users.begin(), users.end()) != 2) {
+    LLVM_DEBUG(llvm::dbgs() << "Too many uses of the elemental\n");
+    return std::nullopt;
+  }
+
+  MatchInfo match;
+  for (mlir::Operation *user : users)
+    mlir::TypeSwitch<mlir::Operation *, void>(user)
+        .Case([&](hlfir::AssignOp op) { match.assign = op; })
+        .Case([&](hlfir::DestroyOp op) { match.destroy = op; });
+
+  if (!match.assign || !match.destroy) {
+    LLVM_DEBUG(llvm::dbgs() << "Couldn't find assign or destroy\n");
+    return std::nullopt;
+  }
+
+  // the array is what the elemental is assigned into
+  // TODO: this could be extended to also allow hlfir.expr by first bufferizing
+  // the incoming expression
+  match.array = match.assign.getLhs();
+  mlir::Type arrayType = mlir::dyn_cast<fir::SequenceType>(
+      fir::unwrapPassByRefType(match.array.getType()));
+  if (!arrayType)
+    return std::nullopt;
+
+  // require that the array elements are trivial
+  // TODO: this is just to make the pass easier to think about. Not an inherent
+  // limitation
+  mlir::Type eleTy = hlfir::getFortranElementType(arrayType);
+  if (!fir::isa_trivial(eleTy))
+    return std::nullopt;
+
+  // the array must have the same shape as the elemental. CSE should have
+  // deduplicated the fir.shape operations where they are provably the same
+  // so we just have to check for the same ssa value
+  // TODO: add more ways of getting the shape of the array
+  mlir::Value arrayShape;
+  if (match.array.getDefiningOp())
+    arrayShape =
+        mlir::TypeSwitch<mlir::Operation *, mlir::Value>(
+            match.array.getDefiningOp())
+            .Case([](hlfir::DesignateOp designate) {
+              return designate.getShape();
+            })
+            .Case([](hlfir::DeclareOp declare) { return declare.getShape(); })
+            .Default([](mlir::Operation *) { return mlir::Value{}; });
+  if (!arrayShape) {
+    LLVM_DEBUG(llvm::dbgs() << "Can't get shape of " << match.array << " at "
+                            << elemental->getLoc() << "\n");
+    return std::nullopt;
+  }
+  if (arrayShape != elemental.getShape()) {
+    // f2018 10.2.1.2 (3) requires the lhs and rhs of an assignment to be
+    // conformable unless the lhs is an allocatable array. In HLFIR we can
+    // see this from the presence or absence of the realloc attribute on
+    // hlfir.assign. If it is not a realloc assignment, we can trust that
+    // the shapes do conform
+    if (match.assign.getRealloc())
+      return std::nullopt;
+  }
+
+  // the transformation wants to apply the elemental in a do-loop at the
+  // hlfir.assign, check there are no effects which make this unsafe
+
+  // keep track of any values written to in the elemental, as these can't be
+  // read from between the elemental and the assignment
+  // likewise, values read in the elemental cannot be written to between the
+  // elemental and the assign
+  mlir::SmallVector<mlir::Value, 1> notToBeAccessedBeforeAssign;
+  // any accesses to the array between the elemental and the assignment mean it
+  // would be unsafe to move the elemental to the assignment
+  notToBeAccessedBeforeAssign.push_back(match.array);
+
+  // 1) side effects in the elemental body - it isn't sufficient to just look
+  // for ordered elementals because we also cannot support out of order reads
+  std::optional<mlir::SmallVector<mlir::MemoryEffects::EffectInstance>>
+      effects = getEffectsBetween(&elemental.getBody()->front(),
+                                  elemental.getBody()->getTerminator());
+  if (!effects) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "operation with unknown effects inside elemental\n");
+    return std::nullopt;
+  }
+  for (const mlir::MemoryEffects::EffectInstance &effect : *effects) {
+    mlir::AliasResult res = containsReadOrWriteEffectOn(effect, match.array);
+    if (res.isNo()) {
+      if (mlir::isa<mlir::MemoryEffects::Write, mlir::MemoryEffects::Read>(
+              effect.getEffect()))
+        if (effect.getValue())
+          notToBeAccessedBeforeAssign.push_back(effect.getValue());
+
+      // this is safe in the elemental
+      continue;
+    }
+
+    // don't allow any aliasing writes in the elemental
+    if (mlir::isa<mlir::MemoryEffects::Write>(effect.getEffect())) {
+      LLVM_DEBUG(llvm::dbgs() << "write inside the elemental body\n");
+      return std::nullopt;
+    }
+
+    // allow if and only if the reads are from the elemental indices, in order
+    // => each iteration doesn't read values written by other iterations
+    // don't allow reads from a different value which may alias: fir alias
+    // analysis isn't precise enough to tell us if two aliasing arrays overlap
+    // exactly or only partially. If they overlap partially, a designate at the
+    // elemental indices could be accessing different elements: e.g. we could
+    // designate two slices of the same array at different start indexes. These
+    // two MustAlias but index 1 of one array isn't the same element as index 1
+    // of the other array.
+    if (!res.isPartial()) {
+      if (auto designate =
+              effect.getValue().getDefiningOp<hlfir::DesignateOp>()) {
+        if (designate.getMemref() != match.array) {
+          LLVM_DEBUG(llvm::dbgs() << "possible read conflict: " << designate
+                                  << " at " << elemental.getLoc() << "\n");
+          return std::nullopt;
+        }
+        auto indices = designate.getIndices();
+        auto elementalIndices = elemental.getIndices();
+        if (indices.size() != elementalIndices.size()) {
+          LLVM_DEBUG(llvm::dbgs() << "possible read conflict: " << designate
+                                  << " at " << elemental.getLoc() << "\n");
+          return std::nullopt;
+        }
+        if (std::equal(indices.begin(), indices.end(), elementalIndices.begin(),
+                       elementalIndices.end()))
+          continue;
+      }
+    }
+    LLVM_DEBUG(llvm::dbgs() << "diasllowed side-effect: " << effect.getValue()
+                            << " for " << elemental.getLoc() << "\n");
+    return std::nullopt;
+  }
+
+  // 2) look for conflicting effects between the elemental and the assignment
+  effects = getEffectsBetween(elemental->getNextNode(), match.assign);
+  if (!effects) {
+    LLVM_DEBUG(
+        llvm::dbgs()
+        << "operation with unknown effects between elemental and assign\n");
+    return std::nullopt;
+  }
+  for (const mlir::MemoryEffects::EffectInstance &effect : *effects) {
+    // not safe to access anything written in the elemental as this write
+    // will be moved to the assignment
+    for (mlir::Value val : notToBeAccessedBeforeAssign) {
+      mlir::AliasResult res = containsReadOrWriteEffectOn(effect, val);
+      if (!res.isNo()) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "diasllowed side-effect: " << effect.getValue() << " for "
+                   << elemental.getLoc() << "\n");
+        return std::nullopt;
+      }
+    }
+  }
+
+  return match;
+}
+
+mlir::LogicalResult ElementalAssignBufferization::matchAndRewrite(
+    hlfir::ElementalOp elemental, mlir::PatternRewriter &rewriter) const {
+  std::optional<MatchInfo> match = findMatch(elemental);
+  if (!match)
+    return rewriter.notifyMatchFailure(
+        elemental, "cannot prove safety of ElementalAssignBufferization");
+
+  mlir::Location loc = elemental->getLoc();
+  fir::FirOpBuilder builder(rewriter, elemental.getOperation());
+  auto extents = hlfir::getIndexExtents(loc, builder, elemental.getShape());
+
+  // create the loop at the assignment
+  builder.setInsertionPoint(match->assign);
+
+  // Generate a loop nest looping around the hlfir.elemental shape and clone
+  // hlfir.elemental region inside the inner loop
+  hlfir::LoopNest loopNest =
+      hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered());
+  builder.setInsertionPointToStart(loopNest.innerLoop.getBody());
+  auto yield = hlfir::inlineElementalOp(loc, builder, elemental,
+                                        loopNest.oneBasedIndices);
+  hlfir::Entity elementValue{yield.getElementValue()};
+  rewriter.eraseOp(yield);
+
+  // Assign the element value to the array element for this iteration.
+  auto arrayElement = hlfir::getElementAt(
+      loc, builder, hlfir::Entity{match->array}, loopNest.oneBasedIndices);
+  builder.create<hlfir::AssignOp>(
+      loc, elementValue, arrayElement, /*realloc=*/false,
+      /*keep_lhs_length_if_realloc=*/false, match->assign.getTemporaryLhs());
+
+  rewriter.eraseOp(match->assign);
+  rewriter.eraseOp(match->destroy);
+  rewriter.eraseOp(elemental);
+  return mlir::success();
+}
+
+class OptimizedBufferizationPass
+    : public hlfir::impl::OptimizedBufferizationBase<
+          OptimizedBufferizationPass> {
+public:
+  void runOnOperation() override {
+    mlir::func::FuncOp func = getOperation();
+    mlir::MLIRContext *context = &getContext();
+
+    mlir::GreedyRewriteConfig config;
+    // Prevent the pattern driver from merging blocks
+    config.enableRegionSimplification = false;
+
+    mlir::RewritePatternSet patterns(context);
+    patterns.insert<ElementalAssignBufferization>(context);
+
+    if (mlir::failed(mlir::applyPatternsAndFoldGreedily(
+            func, std::move(patterns), config))) {
+      mlir::emitError(func.getLoc(),
+                      "failure in HLFIR optimized bufferization");
+      signalPassFailure();
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> hlfir::createOptimizedBufferizationPass() {
+  return std::make_unique<OptimizedBufferizationPass>();
+}

diff  --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index 4f0efb17545d42..0e82f7dfdedb44 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -20,6 +20,12 @@ func.func @_QQmain() {
 // PASSES-NEXT: 'func.func' Pipeline
 // PASSES-NEXT:   SimplifyHLFIRIntrinsics
 // PASSES-NEXT:   InlineElementals
+// PASSES-NEXT:   Canonicalizer
+// PASSES-NEXT:   CSE
+// PASSES-NEXT:    (S) 0 num-cse'd - Number of operations CSE'd
+// PASSES-NEXT:    (S) 0 num-dce'd - Number of operations DCE'd
+// PASSES-NEXT:   'func.func' Pipeline
+// PASSES-NEXT:    OptimizedBufferization
 // PASSES-NEXT:   LowerHLFIROrderedAssignments
 // PASSES-NEXT:   LowerHLFIRIntrinsics
 // PASSES-NEXT:   BufferizeHLFIR

diff  --git a/flang/test/HLFIR/opt-bufferization.fir b/flang/test/HLFIR/opt-bufferization.fir
new file mode 100644
index 00000000000000..73ca51c08e210c
--- /dev/null
+++ b/flang/test/HLFIR/opt-bufferization.fir
@@ -0,0 +1,798 @@
+// RUN: fir-opt --opt-bufferization %s | FileCheck %s
+
+// simplified example
+func.func @simple(%arg: !fir.ref<!fir.array<42xi32>>) {
+  %c42 = arith.constant 42 : index
+  %c1_i32 = arith.constant 1 : i32
+  %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+  %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+  ^bb0(%i: index):
+    %ref = hlfir.designate %array#0 (%i) : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+    %val = fir.load %ref : !fir.ref<i32>
+    %sub = arith.subi %val, %c1_i32 : i32
+    hlfir.yield_element %sub : i32
+  }
+  hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+  hlfir.destroy %elemental : !hlfir.expr<42xi32>
+  return
+}
+// CHECK-LABEL:   func.func @simple(
+// CHECK-SAME:                      %[[VAL_0:.*]]: !fir.ref<!fir.array<42xi32>>) {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 42 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+// CHECK:           fir.do_loop %[[VAL_6:.*]] = %[[VAL_1]] to %[[VAL_2]] step %[[VAL_1]] unordered {
+// CHECK:             %[[VAL_7:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_6]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_8:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+// CHECK:             %[[VAL_9:.*]] = arith.subi %[[VAL_8]], %[[VAL_3]] : i32
+// CHECK:             %[[VAL_10:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_6]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+// CHECK:             hlfir.assign %[[VAL_9]] to %[[VAL_10]] : i32, !fir.ref<i32>
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+
+// check we support reads that don't alias the transformed array
+func.func @read_no_alias(%arg: !fir.ref<!fir.array<42xi32>>, %arg1: !fir.ref<!fir.array<42xi32>>) {
+  %c42 = arith.constant 42 : index
+  %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+  %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+  %other:2 = hlfir.declare %arg1(%shape) {uniq_name = "other"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+  ^bb0(%i: index):
+    %ref = hlfir.designate %array#0 (%i) : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+    %other_ref = hlfir.designate %other#0 (%i) : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+    %val = fir.load %ref : !fir.ref<i32>
+    %other_val = fir.load %other_ref : !fir.ref<i32>
+    %sub = arith.subi %val, %other_val : i32
+    hlfir.yield_element %sub : i32
+  }
+  hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+  hlfir.destroy %elemental : !hlfir.expr<42xi32>
+  return
+}
+// CHECK-LABEL:   func.func @read_no_alias(
+// CHECK-SAME:                             %[[VAL_0:.*]]: !fir.ref<!fir.array<42xi32>>,
+// CHECK-SAME:                             %[[VAL_1:.*]]: !fir.ref<!fir.array<42xi32>>) {
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 42 : index
+// CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+// CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_4]]) {uniq_name = "other"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+// CHECK:           fir.do_loop %[[VAL_7:.*]] = %[[VAL_2]] to %[[VAL_3]] step %[[VAL_2]] unordered {
+// CHECK:             %[[VAL_8:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_7]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_9:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_7]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_10:.*]] = fir.load %[[VAL_8]] : !fir.ref<i32>
+// CHECK:             %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+// CHECK:             %[[VAL_12:.*]] = arith.subi %[[VAL_10]], %[[VAL_11]] : i32
+// CHECK:             %[[VAL_13:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_7]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+// CHECK:             hlfir.assign %[[VAL_12]] to %[[VAL_13]] : i32, !fir.ref<i32>
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+
+
+// check we don't transform when there is another use of the elemental expr
+func.func @two_uses(%arg: !fir.ref<!fir.array<42xi32>>) -> i32 {
+  %c42 = arith.constant 42 : index
+  %c1_i32 = arith.constant 1 : i32
+  %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+  %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+  ^bb0(%i: index):
+    %ref = hlfir.designate %array#0 (%i) : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+    %val = fir.load %ref : !fir.ref<i32>
+    %sub = arith.subi %val, %c1_i32 : i32
+    hlfir.yield_element %sub : i32
+  }
+  hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+  %bad = hlfir.apply %elemental, %c42 : (!hlfir.expr<42xi32>, index) -> i32
+  hlfir.destroy %elemental : !hlfir.expr<42xi32>
+  return %bad : i32
+}
+// CHECK-LABEL:   func.func @two_uses(
+// CHECK-SAME:                        %[[VAL_0:.*]]: !fir.ref<!fir.array<42xi32>>) -> i32 {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 42 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+// CHECK:           %[[VAL_5:.*]] = hlfir.elemental %[[VAL_3]] unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+// CHECK:           ^bb0(%[[VAL_6:.*]]: index):
+// CHECK:             %[[VAL_7:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_6]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_8:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+// CHECK:             %[[VAL_9:.*]] = arith.subi %[[VAL_8]], %[[VAL_2]] : i32
+// CHECK:             hlfir.yield_element %[[VAL_9]] : i32
+// CHECK:           }
+// CHECK:           hlfir.assign %[[VAL_10:.*]] to %[[VAL_4]]#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+// CHECK:           %[[VAL_11:.*]] = hlfir.apply %[[VAL_10]], %[[VAL_1]] : (!hlfir.expr<42xi32>, index) -> i32
+// CHECK:           hlfir.destroy %[[VAL_10]] : !hlfir.expr<42xi32>
+// CHECK:           return %[[VAL_11]] : i32
+// CHECK:         }
+
+// two dimensional array
+func.func @two_dimensional(%arg: !fir.ref<!fir.array<42x42xi32>>) {
+  %c42 = arith.constant 42 : index
+  %c1_i32 = arith.constant 1 : i32
+  %shape = fir.shape %c42, %c42 : (index, index) -> !fir.shape<2>
+  %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42x42xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<42x42xi32>>, !fir.ref<!fir.array<42x42xi32>>)
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<2>) -> !hlfir.expr<42x42xi32> {
+  ^bb0(%i: index, %j: index):
+    %ref = hlfir.designate %array#0 (%i, %j) : (!fir.ref<!fir.array<42x42xi32>>, index, index) -> !fir.ref<i32>
+    %val = fir.load %ref : !fir.ref<i32>
+    %sub = arith.subi %val, %c1_i32 : i32
+    hlfir.yield_element %sub : i32
+  }
+  hlfir.assign %elemental to %array#0 : !hlfir.expr<42x42xi32>, !fir.ref<!fir.array<42x42xi32>>
+  hlfir.destroy %elemental : !hlfir.expr<42x42xi32>
+  return
+}
+// CHECK-LABEL:   func.func @two_dimensional(
+// CHECK-SAME:                               %[[VAL_0:.*]]: !fir.ref<!fir.array<42x42xi32>>) {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 42 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_2]], %[[VAL_2]] : (index, index) -> !fir.shape<2>
+// CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42x42xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<42x42xi32>>, !fir.ref<!fir.array<42x42xi32>>)
+// CHECK:           fir.do_loop %[[VAL_6:.*]] = %[[VAL_1]] to %[[VAL_2]] step %[[VAL_1]] unordered {
+// CHECK:             fir.do_loop %[[VAL_7:.*]] = %[[VAL_1]] to %[[VAL_2]] step %[[VAL_1]] unordered {
+// CHECK:               %[[VAL_8:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_7]], %[[VAL_6]])  : (!fir.ref<!fir.array<42x42xi32>>, index, index) -> !fir.ref<i32>
+// CHECK:               %[[VAL_9:.*]] = fir.load %[[VAL_8]] : !fir.ref<i32>
+// CHECK:               %[[VAL_10:.*]] = arith.subi %[[VAL_9]], %[[VAL_3]] : i32
+// CHECK:               %[[VAL_11:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_7]], %[[VAL_6]])  : (!fir.ref<!fir.array<42x42xi32>>, index, index) -> !fir.ref<i32>
+// CHECK:               hlfir.assign %[[VAL_10]] to %[[VAL_11]] : i32, !fir.ref<i32>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+
+// don't transform when elements are accessed out of order (e.g. transposed): the elemental reads array(j, i) to produce element (i, j), and the expectations after the function keep the elemental, assign and destroy
+func.func @transposed(%arg: !fir.ref<!fir.array<42x42xi32>>) {
+  %c42 = arith.constant 42 : index
+  %c1_i32 = arith.constant 1 : i32
+  %shape = fir.shape %c42, %c42 : (index, index) -> !fir.shape<2>
+  %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42x42xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<42x42xi32>>, !fir.ref<!fir.array<42x42xi32>>)
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<2>) -> !hlfir.expr<42x42xi32> {
+  ^bb0(%i: index, %j: index):
+    %ref = hlfir.designate %array#0 (%j, %i) : (!fir.ref<!fir.array<42x42xi32>>, index, index) -> !fir.ref<i32> // indices swapped relative to the block arguments (%i, %j)
+    %val = fir.load %ref : !fir.ref<i32>
+    %sub = arith.subi %val, %c1_i32 : i32
+    hlfir.yield_element %sub : i32
+  }
+  hlfir.assign %elemental to %array#0 : !hlfir.expr<42x42xi32>, !fir.ref<!fir.array<42x42xi32>> // assigns back to the same array the elemental reads
+  hlfir.destroy %elemental : !hlfir.expr<42x42xi32>
+  return
+}
+// CHECK-LABEL:   func.func @transposed(
+// CHECK-SAME:                          %[[VAL_0:.*]]: !fir.ref<!fir.array<42x42xi32>>) {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 42 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_1]], %[[VAL_1]] : (index, index) -> !fir.shape<2>
+// CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42x42xi32>>, !fir.shape<2>) -> (!fir.ref<!fir.array<42x42xi32>>, !fir.ref<!fir.array<42x42xi32>>)
+// CHECK:           %[[VAL_5:.*]] = hlfir.elemental %[[VAL_3]] unordered : (!fir.shape<2>) -> !hlfir.expr<42x42xi32> {
+// CHECK:           ^bb0(%[[VAL_6:.*]]: index, %[[VAL_7:.*]]: index):
+// CHECK:             %[[VAL_8:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_7]], %[[VAL_6]])  : (!fir.ref<!fir.array<42x42xi32>>, index, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_9:.*]] = fir.load %[[VAL_8]] : !fir.ref<i32>
+// CHECK:             %[[VAL_10:.*]] = arith.subi %[[VAL_9]], %[[VAL_2]] : i32
+// CHECK:             hlfir.yield_element %[[VAL_10]] : i32
+// CHECK:           }
+// CHECK:           hlfir.assign %[[VAL_11:.*]] to %[[VAL_4]]#0 : !hlfir.expr<42x42xi32>, !fir.ref<!fir.array<42x42xi32>>
+// CHECK:           hlfir.destroy %[[VAL_11]] : !hlfir.expr<42x42xi32>
+// CHECK:           return
+// CHECK:         }
+
+// don't transform when there's an operation with unknown effects (here a fir.call to @impure inside the elemental body); the expectations after the function keep the elemental, assign and destroy
+func.func @unknown(%arg: !fir.ref<!fir.array<42xi32>>) {
+  %c42 = arith.constant 42 : index
+  %c1_i32 = arith.constant 1 : i32
+  %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+  %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+  ^bb0(%i: index):
+    %ref = hlfir.designate %array#0 (%i) : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+    %val = fir.load %ref : !fir.ref<i32>
+    %sub = arith.subi %val, %c1_i32 : i32
+    %res = fir.call @impure(%sub) : (i32) -> i32 // the call result, not %sub, is yielded as the element
+    hlfir.yield_element %res : i32
+  }
+  hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+  hlfir.destroy %elemental : !hlfir.expr<42xi32>
+  return
+}
+// CHECK-LABEL:   func.func @unknown(
+// CHECK-SAME:                       %[[VAL_0:.*]]: !fir.ref<!fir.array<42xi32>>) {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 42 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_3:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+// CHECK:           %[[VAL_5:.*]] = hlfir.elemental %[[VAL_3]] unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+// CHECK:           ^bb0(%[[VAL_6:.*]]: index):
+// CHECK:             %[[VAL_7:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_6]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_8:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+// CHECK:             %[[VAL_9:.*]] = arith.subi %[[VAL_8]], %[[VAL_2]] : i32
+// CHECK:             %[[VAL_10:.*]] = fir.call @impure(%[[VAL_9]]) : (i32) -> i32
+// CHECK:             hlfir.yield_element %[[VAL_10]] : i32
+// CHECK:           }
+// CHECK:           hlfir.assign %[[VAL_11:.*]] to %[[VAL_4]]#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+// CHECK:           hlfir.destroy %[[VAL_11]] : !hlfir.expr<42xi32>
+// CHECK:           return
+// CHECK:         }
+
+// don't transform when there's an operation with write effects (here the elemental body assigns array2 to array and stores to a local alloca); the expectations after the function keep the elemental, assign and destroy
+func.func @write(%arg: !fir.ref<!fir.array<42xi32>>, %arg1: !fir.ref<!fir.array<42xi32>>) {
+  %alloc = fir.alloca i32
+  %c42 = arith.constant 42 : index
+  %c1_i32 = arith.constant 1 : i32
+  %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+  %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+  %array2:2 = hlfir.declare %arg1(%shape) {uniq_name = "array2"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+  ^bb0(%i: index):
+    hlfir.assign %array2#0 to %array#0 : !fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>> // whole-array write to %array inside the elemental body
+    %ref = hlfir.designate %array#0 (%i) : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+    %val = fir.load %ref : !fir.ref<i32>
+    %sub = arith.subi %val, %c1_i32 : i32
+    fir.store %sub to %alloc : !fir.ref<i32> // second write, this time to the local %alloc
+    hlfir.yield_element %sub : i32
+  }
+  hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+  hlfir.destroy %elemental : !hlfir.expr<42xi32>
+  return
+}
+// CHECK-LABEL:   func.func @write(
+// CHECK-SAME:                     %[[VAL_0:.*]]: !fir.ref<!fir.array<42xi32>>,
+// CHECK-SAME:                     %[[ARG_1:.*]]: !fir.ref<!fir.array<42xi32>>) {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_2:.*]] = arith.constant 42 : index
+// CHECK:           %[[VAL_3:.*]] = fir.alloca i32
+// CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+// CHECK:           %[[VAL_5B:.*]]:2 = hlfir.declare %[[ARG_1]](%[[VAL_4]]) {uniq_name = "array2"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+// CHECK:           %[[VAL_6:.*]] = hlfir.elemental %[[VAL_4]] unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+// CHECK:           ^bb0(%[[VAL_7:.*]]: index):
+// CHECK:             hlfir.assign %[[VAL_5B]]#0 to %[[VAL_5]]#0 : !fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>
+// CHECK:             %[[VAL_8:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_7]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_9:.*]] = fir.load %[[VAL_8]] : !fir.ref<i32>
+// CHECK:             %[[VAL_10:.*]] = arith.subi %[[VAL_9]], %[[VAL_1]] : i32
+// CHECK:             fir.store %[[VAL_10]] to %[[VAL_3]] : !fir.ref<i32>
+// CHECK:             hlfir.yield_element %[[VAL_10]] : i32
+// CHECK:           }
+// CHECK:           hlfir.assign %[[VAL_11:.*]] to %[[VAL_5]]#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+// CHECK:           hlfir.destroy %[[VAL_11]] : !hlfir.expr<42xi32>
+// CHECK:           return
+// CHECK:         }
+
+// don't transform when there is an aliasing read (here array and arrayDup are both declared from the same %arg and both are read in the elemental); the expectations after the function keep the elemental, assign and destroy
+func.func @readAlias(%arg: !fir.ref<!fir.array<42xi32>>) {
+  %c42 = arith.constant 42 : index
+  %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+  %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+  %arrayDup:2 = hlfir.declare %arg(%shape) {uniq_name = "arrayDup"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>) // second declare of the same %arg
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+  ^bb0(%i: index):
+    %ref = hlfir.designate %array#0 (%i) : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+    %refDup = hlfir.designate %arrayDup#0 (%i) : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+    %val = fir.load %ref : !fir.ref<i32>
+    %valDup = fir.load %refDup : !fir.ref<i32> // read through the duplicate declaration
+    %sub = arith.subi %val, %valDup : i32
+    hlfir.yield_element %sub : i32
+  }
+  hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+  hlfir.destroy %elemental : !hlfir.expr<42xi32>
+  return
+}
+// CHECK-LABEL:   func.func @readAlias(
+// CHECK-SAME:                         %[[VAL_0:.*]]: !fir.ref<!fir.array<42xi32>>) {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 42 : index
+// CHECK:           %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+// CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_2]]) {uniq_name = "arrayDup"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+// CHECK:           %[[VAL_5:.*]] = hlfir.elemental %[[VAL_2]] unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+// CHECK:           ^bb0(%[[VAL_6:.*]]: index):
+// CHECK:             %[[VAL_7:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_6]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_8:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_6]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+// CHECK:             %[[VAL_10:.*]] = fir.load %[[VAL_8]] : !fir.ref<i32>
+// CHECK:             %[[VAL_11:.*]] = arith.subi %[[VAL_9]], %[[VAL_10]] : i32
+// CHECK:             hlfir.yield_element %[[VAL_11]] : i32
+// CHECK:           }
+// CHECK:           hlfir.assign %[[VAL_12:.*]] to %[[VAL_3]]#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+// CHECK:           hlfir.destroy %[[VAL_12]] : !hlfir.expr<42xi32>
+// CHECK:           return
+// CHECK:         }
+
+// don't transform when moving the elemental to the assignment could change the results of a read (the elemental body writes %alloc, which is loaded before the assign); the expectations after the function keep the original op order
+func.func @write_conflict(%arg: !fir.ref<!fir.array<42xi32>>) -> index {
+  %alloc = fir.alloca index
+  %c42 = arith.constant 42 : index
+  %c1_i32 = arith.constant 1 : i32
+  %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+  %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+  ^bb0(%i: index):
+    %ref = hlfir.designate %array#0 (%i) : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+    %val = fir.load %ref : !fir.ref<i32>
+    %sub = arith.subi %val, %c1_i32 : i32
+    // write in elemental: stores the element index %i to %alloc
+    hlfir.assign %i to %alloc : index, !fir.ref<index>
+    hlfir.yield_element %sub : i32
+  }
+  // conflicting read: loads %alloc between the elemental and the assign; the loaded value is the function result
+  %conflict = fir.load %alloc : !fir.ref<index>
+  hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+  hlfir.destroy %elemental : !hlfir.expr<42xi32>
+  return %conflict : index
+}
+// CHECK-LABEL:   func.func @write_conflict(
+// CHECK-SAME:                              %[[VAL_0:.*]]: !fir.ref<!fir.array<42xi32>>) -> index {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_2:.*]] = arith.constant 42 : index
+// CHECK:           %[[VAL_3:.*]] = fir.alloca index
+// CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+// CHECK:           %[[VAL_6:.*]] = hlfir.elemental %[[VAL_4]] unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+// CHECK:           ^bb0(%[[VAL_7:.*]]: index):
+// CHECK:             %[[VAL_8:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_7]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_9:.*]] = fir.load %[[VAL_8]] : !fir.ref<i32>
+// CHECK:             %[[VAL_10:.*]] = arith.subi %[[VAL_9]], %[[VAL_1]] : i32
+// CHECK:             hlfir.assign %[[VAL_7]] to %[[VAL_3]] : index, !fir.ref<index>
+// CHECK:             hlfir.yield_element %[[VAL_10]] : i32
+// CHECK:           }
+// CHECK:           %[[VAL_11:.*]] = fir.load %[[VAL_3]] : !fir.ref<index>
+// CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_5]]#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+// CHECK:           hlfir.destroy %[[VAL_6]] : !hlfir.expr<42xi32>
+// CHECK:           return %[[VAL_11]] : index
+// CHECK:         }
+
+// don't transform when moving the elemental to the assignment could change the results of a read #2 (the elemental body loads %alloc, which is overwritten before the assign); the expectations after the function keep the original op order
+func.func @read_conflict(%arg: !fir.ref<!fir.array<42xi32>>) {
+  %alloc = fir.alloca i32
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  hlfir.assign %c1_i32 to %alloc : i32, !fir.ref<i32> // value of %alloc at the point where the elemental appears
+  %c42 = arith.constant 42 : index
+  %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+  %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+  ^bb0(%i: index):
+    %ref = hlfir.designate %array#0 (%i) : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+    %val = fir.load %ref : !fir.ref<i32>
+    // conflicting read: %alloc is loaded inside the elemental body
+    %read = fir.load %alloc : !fir.ref<i32>
+    %sub = arith.subi %val, %read : i32
+    hlfir.yield_element %sub : i32
+  }
+  // conflicting write: %alloc is overwritten between the elemental and the assign
+  hlfir.assign %c0_i32 to %alloc : i32, !fir.ref<i32>
+  hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+  hlfir.destroy %elemental : !hlfir.expr<42xi32>
+  return
+}
+// CHECK-LABEL:   func.func @read_conflict(
+// CHECK-SAME:                             %[[VAL_0:.*]]: !fir.ref<!fir.array<42xi32>>) {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 42 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : i32
+// CHECK:           %[[VAL_4:.*]] = fir.alloca i32
+// CHECK:           hlfir.assign %[[VAL_2]] to %[[VAL_4]] : i32, !fir.ref<i32>
+// CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+// CHECK:           %[[VAL_7:.*]] = hlfir.elemental %[[VAL_5]] unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+// CHECK:           ^bb0(%[[VAL_8:.*]]: index):
+// CHECK:             %[[VAL_9:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_8]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+// CHECK:             %[[VAL_11:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
+// CHECK:             %[[VAL_12:.*]] = arith.subi %[[VAL_10]], %[[VAL_11]] : i32
+// CHECK:             hlfir.yield_element %[[VAL_12]] : i32
+// CHECK:           }
+// CHECK:           hlfir.assign %[[VAL_3]] to %[[VAL_4]] : i32, !fir.ref<i32>
+// CHECK:           hlfir.assign %[[VAL_7]] to %[[VAL_6]]#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+// CHECK:           hlfir.destroy %[[VAL_7]] : !hlfir.expr<42xi32>
+// CHECK:           return
+// CHECK:         }
+
+fir.global @_QMmEblock : !fir.array<9x9x9xi32> { // 9x9x9 i32 array global, taken by fir.address_of in @_QMmPrepro
+  %0 = fir.undefined !fir.array<9x9x9xi32> // initial value is left undefined
+  fir.has_value %0 : !fir.array<9x9x9xi32>
+}
+fir.global @_QMmECr constant : i32 { // constant i32 global with value 9, taken by fir.address_of in @_QMmPrepro
+  %c9_i32 = arith.constant 9 : i32
+  fir.has_value %c9_i32 : i32
+}
+
+// does it work for the intended case? An elemental updates a section of @_QMmEblock in place inside a fir.do_loop; the expectations after the function replace the elemental, assign and destroy with an unordered fir.do_loop
+func.func @_QMmPrepro(%arg0: !fir.ref<i32> {fir.bindc_name = "imin"}, %arg1: !fir.ref<i32> {fir.bindc_name = "imax"}, %arg2: !fir.ref<i32> {fir.bindc_name = "row"}) {
+  %c10_i32 = arith.constant 10 : i32
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c1 = arith.constant 1 : index
+  %c9 = arith.constant 9 : index
+  %0 = fir.address_of(@_QMmEblock) : !fir.ref<!fir.array<9x9x9xi32>>
+  %1 = fir.shape %c9, %c9, %c9 : (index, index, index) -> !fir.shape<3>
+  %2:2 = hlfir.declare %0(%1) {uniq_name = "_QMmEblock"} : (!fir.ref<!fir.array<9x9x9xi32>>, !fir.shape<3>) -> (!fir.ref<!fir.array<9x9x9xi32>>, !fir.ref<!fir.array<9x9x9xi32>>)
+  %3 = fir.address_of(@_QMmECr) : !fir.ref<i32>
+  %4:2 = hlfir.declare %3 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QMmECr"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %5 = fir.alloca i32 {bindc_name = "i1", uniq_name = "_QMmFreproEi1"}
+  %6:2 = hlfir.declare %5 {uniq_name = "_QMmFreproEi1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %7:2 = hlfir.declare %arg1 {uniq_name = "_QMmFreproEimax"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %8:2 = hlfir.declare %arg0 {uniq_name = "_QMmFreproEimin"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %9:2 = hlfir.declare %arg2 {uniq_name = "_QMmFreproErow"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %10 = fir.load %8#0 : !fir.ref<i32>
+  %11 = fir.convert %10 : (i32) -> index
+  %12 = fir.load %7#0 : !fir.ref<i32>
+  %13 = fir.convert %12 : (i32) -> index
+  %14 = fir.convert %11 : (index) -> i32
+  %15:2 = fir.do_loop %arg3 = %11 to %13 step %c1 iter_args(%arg4 = %14) -> (index, i32) { // loop from imin to imax; %arg4 carries the i32 value stored to i1 each iteration
+    fir.store %arg4 to %6#1 : !fir.ref<i32>
+    %16 = fir.load %9#0 : !fir.ref<i32>
+    %17 = fir.convert %16 : (i32) -> i64
+    %18 = fir.load %6#0 : !fir.ref<i32>
+    %19 = fir.convert %18 : (i32) -> i64
+    %20 = fir.shape %c8 : (index) -> !fir.shape<1>
+    %21 = hlfir.designate %2#0 (%17, %c2:%c9:%c1, %19)  shape %20 : (!fir.ref<!fir.array<9x9x9xi32>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<8xi32>> // 8-element section (row, 2:9:1, i1) of the global array
+    %22 = hlfir.elemental %20 unordered : (!fir.shape<1>) -> !hlfir.expr<8xi32> { // each element is the matching section element minus 10
+    ^bb0(%arg5: index):
+      %27 = hlfir.designate %21 (%arg5)  : (!fir.box<!fir.array<8xi32>>, index) -> !fir.ref<i32>
+      %28 = fir.load %27 : !fir.ref<i32>
+      %29 = arith.subi %28, %c10_i32 : i32
+      hlfir.yield_element %29 : i32
+    }
+    hlfir.assign %22 to %21 : !hlfir.expr<8xi32>, !fir.box<!fir.array<8xi32>> // writes the result back to the same section that was read
+    hlfir.destroy %22 : !hlfir.expr<8xi32>
+    %23 = arith.addi %arg3, %c1 : index
+    %24 = fir.convert %c1 : (index) -> i32
+    %25 = fir.load %6#1 : !fir.ref<i32>
+    %26 = arith.addi %25, %24 : i32
+    fir.result %23, %26 : index, i32
+  }
+  fir.store %15#1 to %6#1 : !fir.ref<i32>
+  return
+}
+// CHECK-LABEL:   func.func @_QMmPrepro(
+// CHECK-SAME:                          %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "imin"},
+// CHECK-SAME:                          %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "imax"},
+// CHECK-SAME:                          %[[VAL_2:.*]]: !fir.ref<i32> {fir.bindc_name = "row"}) {
+// CHECK:           %[[VAL_3:.*]] = arith.constant 10 : i32
+// CHECK:           %[[VAL_4:.*]] = arith.constant 8 : index
+// CHECK:           %[[VAL_5:.*]] = arith.constant 2 : index
+// CHECK:           %[[VAL_6:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_7:.*]] = arith.constant 9 : index
+// CHECK:           %[[VAL_8:.*]] = fir.address_of(@_QMmEblock) : !fir.ref<!fir.array<9x9x9xi32>>
+// CHECK:           %[[VAL_9:.*]] = fir.shape %[[VAL_7]], %[[VAL_7]], %[[VAL_7]] : (index, index, index) -> !fir.shape<3>
+// CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]](%[[VAL_9]]) {uniq_name = "_QMmEblock"} : (!fir.ref<!fir.array<9x9x9xi32>>, !fir.shape<3>) -> (!fir.ref<!fir.array<9x9x9xi32>>, !fir.ref<!fir.array<9x9x9xi32>>)
+// CHECK:           %[[VAL_11:.*]] = fir.address_of(@_QMmECr) : !fir.ref<i32>
+// CHECK:           %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QMmECr"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:           %[[VAL_13:.*]] = fir.alloca i32 {bindc_name = "i1", uniq_name = "_QMmFreproEi1"}
+// CHECK:           %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_13]] {uniq_name = "_QMmFreproEi1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:           %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QMmFreproEimax"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:           %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QMmFreproEimin"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:           %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QMmFreproErow"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:           %[[VAL_18:.*]] = fir.load %[[VAL_16]]#0 : !fir.ref<i32>
+// CHECK:           %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> index
+// CHECK:           %[[VAL_20:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+// CHECK:           %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> index
+// CHECK:           %[[VAL_22:.*]] = fir.convert %[[VAL_19]] : (index) -> i32
+// CHECK:           %[[VAL_23:.*]]:2 = fir.do_loop %[[VAL_24:.*]] = %[[VAL_19]] to %[[VAL_21]] step %[[VAL_6]] iter_args(%[[VAL_25:.*]] = %[[VAL_22]]) -> (index, i32) {
+// CHECK:             fir.store %[[VAL_25]] to %[[VAL_14]]#1 : !fir.ref<i32>
+// CHECK:             %[[VAL_26:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<i32>
+// CHECK:             %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (i32) -> i64
+// CHECK:             %[[VAL_28:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
+// CHECK:             %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i32) -> i64
+// CHECK:             %[[VAL_30:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
+// CHECK:             %[[VAL_31:.*]] = hlfir.designate %[[VAL_10]]#0 (%[[VAL_27]], %[[VAL_5]]:%[[VAL_7]]:%[[VAL_6]], %[[VAL_29]])  shape %[[VAL_30]] : (!fir.ref<!fir.array<9x9x9xi32>>, i64, index, index, index, i64, !fir.shape<1>) -> !fir.box<!fir.array<8xi32>>
+// CHECK:             fir.do_loop %[[VAL_32:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_6]] unordered {
+// CHECK:               %[[VAL_33:.*]] = hlfir.designate %[[VAL_31]] (%[[VAL_32]])  : (!fir.box<!fir.array<8xi32>>, index) -> !fir.ref<i32>
+// CHECK:               %[[VAL_34:.*]] = fir.load %[[VAL_33]] : !fir.ref<i32>
+// CHECK:               %[[VAL_35:.*]] = arith.subi %[[VAL_34]], %[[VAL_3]] : i32
+// CHECK:               %[[VAL_36:.*]] = hlfir.designate %[[VAL_31]] (%[[VAL_32]])  : (!fir.box<!fir.array<8xi32>>, index) -> !fir.ref<i32>
+// CHECK:               hlfir.assign %[[VAL_35]] to %[[VAL_36]] : i32, !fir.ref<i32>
+// CHECK:             }
+// CHECK:             %[[VAL_37:.*]] = arith.addi %[[VAL_24]], %[[VAL_6]] : index
+// CHECK:             %[[VAL_38:.*]] = fir.convert %[[VAL_6]] : (index) -> i32
+// CHECK:             %[[VAL_39:.*]] = fir.load %[[VAL_14]]#1 : !fir.ref<i32>
+// CHECK:             %[[VAL_40:.*]] = arith.addi %[[VAL_39]], %[[VAL_38]] : i32
+// CHECK:             fir.result %[[VAL_37]], %[[VAL_40]] : index, i32
+// CHECK:           }
+// CHECK:           fir.store %[[VAL_41:.*]]#1 to %[[VAL_14]]#1 : !fir.ref<i32>
+// CHECK:           return
+// CHECK:         }
+
+// support for z = x + y: the elemental reads only x and y and its result is assigned to z; the expectations after the function replace the elemental, assign and destroy with an unordered fir.do_loop
+func.func @other_reads(%z_arg: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "z"}, %x_arg: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}, %y_arg: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "y"}) {
+  %c0 = arith.constant 0 : index
+  %box_dims:3 = fir.box_dims %z_arg, %c0 : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+  // ignore lower bound etc: only %box_dims#1 is used, and the same shape is given to z, x and y
+  %shape = fir.shape %box_dims#1 : (index) -> !fir.shape<1>
+
+  %z:2 = hlfir.declare %z_arg(%shape) {uniq_name = "z"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+  %x:2 = hlfir.declare %x_arg(%shape) {uniq_name = "x"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+  %y:2 = hlfir.declare %y_arg(%shape) {uniq_name = "y"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+  ^bb0(%i: index):
+    %x_ref = hlfir.designate %x#0 (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %x_val = fir.load %x_ref : !fir.ref<f32>
+    %y_ref = hlfir.designate %y#0 (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %y_val = fir.load %y_ref : !fir.ref<f32>
+    %add = arith.addf %x_val, %y_val : f32
+    hlfir.yield_element %add : f32
+  }
+  hlfir.assign %elemental to %z#0 : !hlfir.expr<?xf32>, !fir.box<!fir.array<?xf32>> // destination z is not read inside the elemental body
+  hlfir.destroy %elemental : !hlfir.expr<?xf32>
+  return
+}
+// CHECK-LABEL:   func.func @other_reads(
+// CHECK-SAME:                      %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "z"},
+// CHECK-SAME:                      %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"},
+// CHECK-SAME:                      %[[VAL_2:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "y"}) {
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "z"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+// CHECK:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) {uniq_name = "x"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+// CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_6]]) {uniq_name = "y"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+// CHECK:           fir.do_loop %[[VAL_10:.*]] = %[[VAL_3]] to %[[VAL_5]]#1 step %[[VAL_3]] unordered {
+// CHECK:             %[[VAL_11:.*]] = hlfir.designate %[[VAL_8]]#0 (%[[VAL_10]])  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+// CHECK:             %[[VAL_12:.*]] = fir.load %[[VAL_11]] : !fir.ref<f32>
+// CHECK:             %[[VAL_13:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_10]])  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+// CHECK:             %[[VAL_14:.*]] = fir.load %[[VAL_13]] : !fir.ref<f32>
+// CHECK:             %[[VAL_15:.*]] = arith.addf %[[VAL_12]], %[[VAL_14]] : f32
+// CHECK:             %[[VAL_16:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_10]])  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+// CHECK:             hlfir.assign %[[VAL_15]] to %[[VAL_16]] : f32, !fir.ref<f32>
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+
+// support for z = x + y, when z is declared after the elemental: the hlfir.declare of z sits between the elemental and the assign; the expectations after the function place all three declares before the generated unordered fir.do_loop
+func.func @other_reads_late_decl(%z_arg: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "z"}, %x_arg: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}, %y_arg: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "y"}) {
+  %c0 = arith.constant 0 : index
+  %box_dims:3 = fir.box_dims %z_arg, %c0 : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+  // ignore lower bound etc: only %box_dims#1 is used, and the same shape is given to x, y and z
+  %shape = fir.shape %box_dims#1 : (index) -> !fir.shape<1>
+
+  %x:2 = hlfir.declare %x_arg(%shape) {uniq_name = "x"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+  %y:2 = hlfir.declare %y_arg(%shape) {uniq_name = "y"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+
+  %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+  ^bb0(%i: index):
+    %x_ref = hlfir.designate %x#0 (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %x_val = fir.load %x_ref : !fir.ref<f32>
+    %y_ref = hlfir.designate %y#0 (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %y_val = fir.load %y_ref : !fir.ref<f32>
+    %add = arith.addf %x_val, %y_val : f32
+    hlfir.yield_element %add : f32
+  }
+  %z:2 = hlfir.declare %z_arg(%shape) {uniq_name = "z"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) // declared only after the elemental
+  hlfir.assign %elemental to %z#0 : !hlfir.expr<?xf32>, !fir.box<!fir.array<?xf32>>
+  hlfir.destroy %elemental : !hlfir.expr<?xf32>
+  return
+}
+// CHECK-LABEL:   func.func @other_reads_late_decl(
+// CHECK-SAME:                      %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "z"},
+// CHECK-SAME:                      %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"},
+// CHECK-SAME:                      %[[VAL_2:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "y"}) {
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1>
+// CHECK-DAG:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "z"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+// CHECK-DAG:           %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_6]]) {uniq_name = "x"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+// CHECK-DAG:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_6]]) {uniq_name = "y"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+// CHECK:           fir.do_loop %[[VAL_10:.*]] = %[[VAL_3]] to %[[VAL_5]]#1 step %[[VAL_3]] unordered {
+// CHECK:             %[[VAL_11:.*]] = hlfir.designate %[[VAL_8]]#0 (%[[VAL_10]])  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+// CHECK:             %[[VAL_12:.*]] = fir.load %[[VAL_11]] : !fir.ref<f32>
+// CHECK:             %[[VAL_13:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_10]])  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+// CHECK:             %[[VAL_14:.*]] = fir.load %[[VAL_13]] : !fir.ref<f32>
+// CHECK:             %[[VAL_15:.*]] = arith.addf %[[VAL_12]], %[[VAL_14]] : f32
+// CHECK:             %[[VAL_16:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_10]])  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+// CHECK:             hlfir.assign %[[VAL_15]] to %[[VAL_16]] : f32, !fir.ref<f32>
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+
+// support for z = x + y, when z is declared after the elemental and the shape values are different
+// Test input: x + y is computed by an hlfir.elemental over %xy_shape and then
+// assigned to a section of z. z is declared only after the elemental, and the
+// section's shape (%slice_shape) is a different SSA value from %xy_shape.
+// The CHECK lines after this function expect a single fir.do_loop from 1 to
+// %x_box_dims#1 that stores each sum directly into %z_slice.
+func.func @other_reads_odd_shape(%z_arg: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "z"}, %x_arg: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"}, %y_arg: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "y"}) {
+  %c0 = arith.constant 0 : index
+  // Query dimension 0 of x; only result #1 is used below.
+  %x_box_dims:3 = fir.box_dims %x_arg, %c0 : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+  // ignore lower bound etc
+  %xy_shape = fir.shape %x_box_dims#1 : (index) -> !fir.shape<1>
+
+  // x and y are both declared with the shape derived from x.
+  %x:2 = hlfir.declare %x_arg(%xy_shape) {uniq_name = "x"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+  %y:2 = hlfir.declare %y_arg(%xy_shape) {uniq_name = "y"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+
+  // Element-wise x(i) + y(i); the body reads only x and y.
+  %elemental = hlfir.elemental %xy_shape unordered : (!fir.shape<1>) -> !hlfir.expr<?xf32> {
+  ^bb0(%i: index):
+    %x_ref = hlfir.designate %x#0 (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %x_val = fir.load %x_ref : !fir.ref<f32>
+    %y_ref = hlfir.designate %y#0 (%i) : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+    %y_val = fir.load %y_ref : !fir.ref<f32>
+    %add = arith.addf %x_val, %y_val : f32
+    hlfir.yield_element %add : f32
+  }
+
+  // z is declared after the elemental, with a shape taken from its own box.
+  %z_box_dims:3 = fir.box_dims %z_arg, %c0 : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+  // ignore lower bound etc
+  %z_shape = fir.shape %z_box_dims#1 : (index) -> !fir.shape<1>
+  %z:2 = hlfir.declare %z_arg(%z_shape) {uniq_name = "z"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+  // assume the length of z is 10 longer than the length of x
+  %c10 = arith.constant 10 : index
+  %c1 = arith.constant 1 : index
+  // %slice_extent = 10 + x_box_dims#1. It is used both as the upper bound of
+  // the triplet and as the extent in %slice_shape.
+  // NOTE(review): a 10:ub:1 triplet would normally hold ub - 9 elements, so the
+  // declared section shape does not match the triplet or the elemental shape;
+  // presumably deliberate, since the test only needs the shapes to differ -
+  // confirm.
+  %slice_extent = arith.addi %c10, %x_box_dims#1 : index
+  %slice_shape = fir.shape %slice_extent : (index) -> !fir.shape<1>
+  %z_slice = hlfir.designate %z#0 (%c10:%slice_extent:%c1) shape %slice_shape : (!fir.box<!fir.array<?xf32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
+
+  // The elemental's only uses: one assignment to the section, then destroy.
+  hlfir.assign %elemental to %z_slice : !hlfir.expr<?xf32>, !fir.box<!fir.array<?xf32>>
+  hlfir.destroy %elemental : !hlfir.expr<?xf32>
+  return
+}
+// CHECK-LABEL: func.func @other_reads_odd_shape(
+// CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "z"},
+// CHECK-SAME:                                   %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "x"},
+// CHECK-SAME:                                   %[[VAL_2:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "y"}) {
+// CHECK:         %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK:         %[[VAL_4:.*]] = arith.constant 10 : index
+// CHECK:         %[[VAL_5:.*]] = arith.constant 0 : index
+// CHECK:         %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_5]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+// CHECK:         %[[VAL_7:.*]] = fir.shape %[[VAL_6]]#1 : (index) -> !fir.shape<1>
+// CHECK:         %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]](%[[VAL_7]]) {uniq_name = "x"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+// CHECK:         %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_2]](%[[VAL_7]]) {uniq_name = "y"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+// CHECK:         %[[VAL_10:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
+// CHECK:         %[[VAL_11:.*]] = fir.shape %[[VAL_10]]#1 : (index) -> !fir.shape<1>
+// CHECK:         %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_11]]) {uniq_name = "z"} : (!fir.box<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+// CHECK:         %[[VAL_13:.*]] = arith.addi %[[VAL_6]]#1, %[[VAL_4]] : index
+// CHECK:         %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1>
+// CHECK:         %[[VAL_15:.*]] = hlfir.designate %[[VAL_12]]#0 (%[[VAL_4]]:%[[VAL_13]]:%[[VAL_3]])  shape %[[VAL_14]] : (!fir.box<!fir.array<?xf32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xf32>>
+// CHECK:         fir.do_loop %[[VAL_16:.*]] = %[[VAL_3]] to %[[VAL_6]]#1 step %[[VAL_3]] unordered {
+// CHECK:           %[[VAL_17:.*]] = hlfir.designate %[[VAL_8]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+// CHECK:           %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref<f32>
+// CHECK:           %[[VAL_19:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+// CHECK:           %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref<f32>
+// CHECK:           %[[VAL_21:.*]] = arith.addf %[[VAL_18]], %[[VAL_20]] : f32
+// CHECK:           %[[VAL_22:.*]] = hlfir.designate %[[VAL_15]] (%[[VAL_16]])  : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+// CHECK:           hlfir.assign %[[VAL_21]] to %[[VAL_22]] : f32, !fir.ref<f32>
+// CHECK:         }
+// CHECK:         return
+// CHECK:       }
+
+// full test from intended code sample
+// module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.target_triple = "aarch64-unknown-linux-gnu"} {
+// Test input for the 2-D case. Read from the IR below, the assignment is
+// roughly ddx(2:i-1, 1:j) = array(3:i, 1:j) - array(1:i-2, 1:j), where i and j
+// are set from result #1 of fir.box_dims on dimensions 0 and 1 of array.
+// The CHECK lines after this function expect the elemental, assign and destroy
+// to be replaced by two nested fir.do_loop ops (outer bound %28, inner bound
+// %24) that store each difference directly into the ddx section %44.
+func.func @_QPddx(%arg0: !fir.box<!fir.array<?x?xf64>> {fir.bindc_name = "array"}) -> !fir.array<?x?xf64> {
+  %c-1 = arith.constant -1 : index
+  %c-2 = arith.constant -2 : index
+  %c1_i32 = arith.constant 1 : i32
+  %c2 = arith.constant 2 : index
+  %c2_i32 = arith.constant 2 : i32
+  %c3 = arith.constant 3 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  // Declarations: the dummy argument array and the local i32 variables i and j.
+  %0:2 = hlfir.declare %arg0 {uniq_name = "_QFddxEarray"} : (!fir.box<!fir.array<?x?xf64>>) -> (!fir.box<!fir.array<?x?xf64>>, !fir.box<!fir.array<?x?xf64>>)
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFddxEi"}
+  %2:2 = hlfir.declare %1 {uniq_name = "_QFddxEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  %3 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFddxEj"}
+  %4:2 = hlfir.declare %3 {uniq_name = "_QFddxEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  // %9 = max(box_dims(array, 0)#1, 0): first size operand of the ddx alloca.
+  %5:3 = fir.box_dims %0#0, %c0 : (!fir.box<!fir.array<?x?xf64>>, index) -> (index, index, index)
+  %6 = fir.convert %5#1 : (index) -> i64
+  %7 = fir.convert %6 : (i64) -> index
+  %8 = arith.cmpi sgt, %7, %c0 : index
+  %9 = arith.select %8, %7, %c0 : index
+  // %14 = max(box_dims(array, 1)#1, 0): second size operand of the ddx alloca.
+  %10:3 = fir.box_dims %0#0, %c1 : (!fir.box<!fir.array<?x?xf64>>, index) -> (index, index, index)
+  %11 = fir.convert %10#1 : (index) -> i64
+  %12 = fir.convert %11 : (i64) -> index
+  %13 = arith.cmpi sgt, %12, %c0 : index
+  %14 = arith.select %13, %12, %c0 : index
+  // Local result array ddx, sized %9 x %14 and declared with shape %16.
+  %15 = fir.alloca !fir.array<?x?xf64>, %9, %14 {bindc_name = "ddx", uniq_name = "_QFddxEddx"}
+  %16 = fir.shape %9, %14 : (index, index) -> !fir.shape<2>
+  %17:2 = hlfir.declare %15(%16) {uniq_name = "_QFddxEddx"} : (!fir.ref<!fir.array<?x?xf64>>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xf64>>, !fir.ref<!fir.array<?x?xf64>>)
+  // i = box_dims(array, 0)#1 ; j = box_dims(array, 1)#1 (converted to i32).
+  %18 = fir.convert %5#1 : (index) -> i32
+  hlfir.assign %18 to %2#0 : i32, !fir.ref<i32>
+  %19 = fir.convert %10#1 : (index) -> i32
+  hlfir.assign %19 to %4#0 : i32, !fir.ref<i32>
+  // First operand section: array(3:i, 1:j) with shape
+  // %29 = (max(i - 2, 0), max(j, 0)). %29 is also the elemental's shape.
+  %20 = fir.load %2#0 : !fir.ref<i32>
+  %21 = fir.convert %20 : (i32) -> index
+  %22 = arith.addi %21, %c-2 : index
+  %23 = arith.cmpi sgt, %22, %c0 : index
+  %24 = arith.select %23, %22, %c0 : index
+  %25 = fir.load %4#0 : !fir.ref<i32>
+  %26 = fir.convert %25 : (i32) -> index
+  %27 = arith.cmpi sgt, %26, %c0 : index
+  %28 = arith.select %27, %26, %c0 : index
+  %29 = fir.shape %24, %28 : (index, index) -> !fir.shape<2>
+  %30 = hlfir.designate %0#0 (%c3:%21:%c1, %c1:%26:%c1)  shape %29 : (!fir.box<!fir.array<?x?xf64>>, index, index, index, index, index, index, !fir.shape<2>) -> !fir.box<!fir.array<?x?xf64>>
+  // Second operand section: array(1:i-2, 1:j). Its shape %35 is a separate
+  // SSA value from %29; here i - 2 is computed in i32 before conversion.
+  %31 = arith.subi %20, %c2_i32 : i32
+  %32 = fir.convert %31 : (i32) -> index
+  %33 = arith.cmpi sgt, %32, %c0 : index
+  %34 = arith.select %33, %32, %c0 : index
+  %35 = fir.shape %34, %28 : (index, index) -> !fir.shape<2>
+  %36 = hlfir.designate %0#0 (%c1:%32:%c1, %c1:%26:%c1)  shape %35 : (!fir.box<!fir.array<?x?xf64>>, index, index, index, index, index, index, !fir.shape<2>) -> !fir.box<!fir.array<?x?xf64>>
+  // Element-wise difference %30(a1, a2) - %36(a1, a2); the body reads only
+  // sections of array.
+  %37 = hlfir.elemental %29 unordered : (!fir.shape<2>) -> !hlfir.expr<?x?xf64> {
+  ^bb0(%arg1: index, %arg2: index):
+    %46 = hlfir.designate %30 (%arg1, %arg2)  : (!fir.box<!fir.array<?x?xf64>>, index, index) -> !fir.ref<f64>
+    %47 = hlfir.designate %36 (%arg1, %arg2)  : (!fir.box<!fir.array<?x?xf64>>, index, index) -> !fir.ref<f64>
+    %48 = fir.load %46 : !fir.ref<f64>
+    %49 = fir.load %47 : !fir.ref<f64>
+    %50 = arith.subf %48, %49 fastmath<contract> : f64
+    hlfir.yield_element %50 : f64
+  }
+  // Assignment target: ddx(2:i-1, 1:j) with shape
+  // %43 = (max((i - 1) - 1, 0), max(j, 0)), again a distinct SSA value from
+  // the elemental's shape %29.
+  %38 = arith.subi %20, %c1_i32 : i32
+  %39 = fir.convert %38 : (i32) -> index
+  %40 = arith.addi %39, %c-1 : index
+  %41 = arith.cmpi sgt, %40, %c0 : index
+  %42 = arith.select %41, %40, %c0 : index
+  %43 = fir.shape %42, %28 : (index, index) -> !fir.shape<2>
+  %44 = hlfir.designate %17#0 (%c2:%39:%c1, %c1:%26:%c1)  shape %43 : (!fir.box<!fir.array<?x?xf64>>, index, index, index, index, index, index, !fir.shape<2>) -> !fir.box<!fir.array<?x?xf64>>
+  // The elemental's only uses: one assignment to the ddx section, then destroy.
+  hlfir.assign %37 to %44 : !hlfir.expr<?x?xf64>, !fir.box<!fir.array<?x?xf64>>
+  hlfir.destroy %37 : !hlfir.expr<?x?xf64>
+  // Load the whole ddx array through its second declare result and return it.
+  %45 = fir.load %17#1 : !fir.ref<!fir.array<?x?xf64>>
+  return %45 : !fir.array<?x?xf64>
+}
+// CHECK-LABEL:   func.func @_QPddx(
+// CHECK-SAME:                      %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xf64>> {fir.bindc_name = "array"}) -> !fir.array<?x?xf64> {
+// CHECK:           %[[VAL_1:.*]] = arith.constant -1 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant -2 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_4:.*]] = arith.constant 2 : index
+// CHECK:           %[[VAL_5:.*]] = arith.constant 2 : i32
+// CHECK:           %[[VAL_6:.*]] = arith.constant 3 : index
+// CHECK:           %[[VAL_7:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_8:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFddxEarray"} : (!fir.box<!fir.array<?x?xf64>>) -> (!fir.box<!fir.array<?x?xf64>>, !fir.box<!fir.array<?x?xf64>>)
+// CHECK:           %[[VAL_10:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFddxEi"}
+// CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFddxEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:           %[[VAL_12:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFddxEj"}
+// CHECK:           %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFddxEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK:           %[[VAL_14:.*]]:3 = fir.box_dims %[[VAL_9]]#0, %[[VAL_8]] : (!fir.box<!fir.array<?x?xf64>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_15:.*]] = fir.convert %[[VAL_14]]#1 : (index) -> i64
+// CHECK:           %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i64) -> index
+// CHECK:           %[[VAL_17:.*]] = arith.cmpi sgt, %[[VAL_16]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_18:.*]] = arith.select %[[VAL_17]], %[[VAL_16]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_19:.*]]:3 = fir.box_dims %[[VAL_9]]#0, %[[VAL_7]] : (!fir.box<!fir.array<?x?xf64>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_20:.*]] = fir.convert %[[VAL_19]]#1 : (index) -> i64
+// CHECK:           %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i64) -> index
+// CHECK:           %[[VAL_22:.*]] = arith.cmpi sgt, %[[VAL_21]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_23:.*]] = arith.select %[[VAL_22]], %[[VAL_21]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_24:.*]] = fir.alloca !fir.array<?x?xf64>, %[[VAL_18]], %[[VAL_23]] {bindc_name = "ddx", uniq_name = "_QFddxEddx"}
+// CHECK:           %[[VAL_25:.*]] = fir.shape %[[VAL_18]], %[[VAL_23]] : (index, index) -> !fir.shape<2>
+// CHECK:           %[[VAL_26:.*]]:2 = hlfir.declare %[[VAL_24]](%[[VAL_25]]) {uniq_name = "_QFddxEddx"} : (!fir.ref<!fir.array<?x?xf64>>, !fir.shape<2>) -> (!fir.box<!fir.array<?x?xf64>>, !fir.ref<!fir.array<?x?xf64>>)
+// CHECK:           %[[VAL_27:.*]] = fir.convert %[[VAL_14]]#1 : (index) -> i32
+// CHECK:           hlfir.assign %[[VAL_27]] to %[[VAL_11]]#0 : i32, !fir.ref<i32>
+// CHECK:           %[[VAL_28:.*]] = fir.convert %[[VAL_19]]#1 : (index) -> i32
+// CHECK:           hlfir.assign %[[VAL_28]] to %[[VAL_13]]#0 : i32, !fir.ref<i32>
+// CHECK:           %[[VAL_29:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+// CHECK:           %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (i32) -> index
+// CHECK:           %[[VAL_31:.*]] = arith.addi %[[VAL_30]], %[[VAL_2]] : index
+// CHECK:           %[[VAL_32:.*]] = arith.cmpi sgt, %[[VAL_31]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_33:.*]] = arith.select %[[VAL_32]], %[[VAL_31]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_34:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref<i32>
+// CHECK:           %[[VAL_35:.*]] = fir.convert %[[VAL_34]] : (i32) -> index
+// CHECK:           %[[VAL_36:.*]] = arith.cmpi sgt, %[[VAL_35]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_37:.*]] = arith.select %[[VAL_36]], %[[VAL_35]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_38:.*]] = fir.shape %[[VAL_33]], %[[VAL_37]] : (index, index) -> !fir.shape<2>
+// CHECK:           %[[VAL_39:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_6]]:%[[VAL_30]]:%[[VAL_7]], %[[VAL_7]]:%[[VAL_35]]:%[[VAL_7]])  shape %[[VAL_38]] : (!fir.box<!fir.array<?x?xf64>>, index, index, index, index, index, index, !fir.shape<2>) -> !fir.box<!fir.array<?x?xf64>>
+// CHECK:           %[[VAL_40:.*]] = arith.subi %[[VAL_29]], %[[VAL_5]] : i32
+// CHECK:           %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i32) -> index
+// CHECK:           %[[VAL_42:.*]] = arith.cmpi sgt, %[[VAL_41]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_43:.*]] = arith.select %[[VAL_42]], %[[VAL_41]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_44:.*]] = fir.shape %[[VAL_43]], %[[VAL_37]] : (index, index) -> !fir.shape<2>
+// CHECK:           %[[VAL_45:.*]] = hlfir.designate %[[VAL_9]]#0 (%[[VAL_7]]:%[[VAL_41]]:%[[VAL_7]], %[[VAL_7]]:%[[VAL_35]]:%[[VAL_7]])  shape %[[VAL_44]] : (!fir.box<!fir.array<?x?xf64>>, index, index, index, index, index, index, !fir.shape<2>) -> !fir.box<!fir.array<?x?xf64>>
+// CHECK:           %[[VAL_46:.*]] = arith.subi %[[VAL_29]], %[[VAL_3]] : i32
+// CHECK:           %[[VAL_47:.*]] = fir.convert %[[VAL_46]] : (i32) -> index
+// CHECK:           %[[VAL_48:.*]] = arith.addi %[[VAL_47]], %[[VAL_1]] : index
+// CHECK:           %[[VAL_49:.*]] = arith.cmpi sgt, %[[VAL_48]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_50:.*]] = arith.select %[[VAL_49]], %[[VAL_48]], %[[VAL_8]] : index
+// CHECK:           %[[VAL_51:.*]] = fir.shape %[[VAL_50]], %[[VAL_37]] : (index, index) -> !fir.shape<2>
+// CHECK:           %[[VAL_52:.*]] = hlfir.designate %[[VAL_26]]#0 (%[[VAL_4]]:%[[VAL_47]]:%[[VAL_7]], %[[VAL_7]]:%[[VAL_35]]:%[[VAL_7]])  shape %[[VAL_51]] : (!fir.box<!fir.array<?x?xf64>>, index, index, index, index, index, index, !fir.shape<2>) -> !fir.box<!fir.array<?x?xf64>>
+// CHECK:           fir.do_loop %[[VAL_53:.*]] = %[[VAL_7]] to %[[VAL_37]] step %[[VAL_7]] unordered {
+// CHECK:             fir.do_loop %[[VAL_54:.*]] = %[[VAL_7]] to %[[VAL_33]] step %[[VAL_7]] unordered {
+// CHECK:               %[[VAL_55:.*]] = hlfir.designate %[[VAL_39]] (%[[VAL_54]], %[[VAL_53]])  : (!fir.box<!fir.array<?x?xf64>>, index, index) -> !fir.ref<f64>
+// CHECK:               %[[VAL_56:.*]] = hlfir.designate %[[VAL_45]] (%[[VAL_54]], %[[VAL_53]])  : (!fir.box<!fir.array<?x?xf64>>, index, index) -> !fir.ref<f64>
+// CHECK:               %[[VAL_57:.*]] = fir.load %[[VAL_55]] : !fir.ref<f64>
+// CHECK:               %[[VAL_58:.*]] = fir.load %[[VAL_56]] : !fir.ref<f64>
+// CHECK:               %[[VAL_59:.*]] = arith.subf %[[VAL_57]], %[[VAL_58]] fastmath<contract> : f64
+// CHECK:               %[[VAL_60:.*]] = hlfir.designate %[[VAL_52]] (%[[VAL_54]], %[[VAL_53]])  : (!fir.box<!fir.array<?x?xf64>>, index, index) -> !fir.ref<f64>
+// CHECK:               hlfir.assign %[[VAL_59]] to %[[VAL_60]] : f64, !fir.ref<f64>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           %[[VAL_61:.*]] = fir.load %[[VAL_26]]#1 : !fir.ref<!fir.array<?x?xf64>>
+// CHECK:           return %[[VAL_61]] : !fir.array<?x?xf64>
+// CHECK:         }