[flang-commits] [flang] [flang] Enable loop-versioning for slices. (PR #120344)

Wed Dec 18 11:48:26 PST 2024

https://github.com/vzakhari updated https://github.com/llvm/llvm-project/pull/120344

>From 0f6cb878b67d5b69afa669caaa501f6c544528df Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Tue, 17 Dec 2024 17:16:32 -0800
Subject: [PATCH] [flang] Enable loop-versioning for slices.

Loops resulting from array expressions like array(:,i)
may be versioned for the unit stride of the innermost dimension,
when the initial array is an assumed-shape array (such arrays are
contiguous in many Fortran programs).
This speeds up facerec by about 12% due to further vectorization
of the innermost loop produced for the total SUM reduction.
---
 .../Optimizer/Transforms/LoopVersioning.cpp   | 116 ++++--
 flang/test/Transforms/loop-versioning.fir     | 353 ++++++++++++++++++
 2 files changed, 442 insertions(+), 27 deletions(-)

diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
index adc39861840ab1..b534ec160ce215 100644
--- a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
+++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
@@ -145,11 +145,45 @@ struct ArgsUsageInLoop {
 };
 } // namespace
 
-static fir::SequenceType getAsSequenceType(mlir::Value *v) {
-  mlir::Type argTy = fir::unwrapPassByRefType(fir::unwrapRefType(v->getType()));
+static fir::SequenceType getAsSequenceType(mlir::Value v) {
+  mlir::Type argTy = fir::unwrapPassByRefType(fir::unwrapRefType(v.getType()));
   return mlir::dyn_cast<fir::SequenceType>(argTy);
 }
 
+/// Return the rank and the element size (in bytes) of the given
+/// value \p v. If it is not an array or the element type is not
+/// supported, then return <0, 0>. Only trivial data types
+/// are currently supported.
+/// When \p isArgument is true, \p v is assumed to be a function
+/// argument. If \p v's type does not look like a type of an assumed
+/// shape array, then the function returns <0, 0>.
+/// When \p isArgument is false, array types with known innermost
+/// dimension are allowed to proceed.
+static std::pair<unsigned, size_t>
+getRankAndElementSize(const fir::KindMapping &kindMap,
+                      const mlir::DataLayout &dl, mlir::Value v,
+                      bool isArgument = false) {
+  if (auto seqTy = getAsSequenceType(v)) {
+    unsigned rank = seqTy.getDimension();
+    if (rank > 0 &&
+        (!isArgument ||
+         seqTy.getShape()[0] == fir::SequenceType::getUnknownExtent())) {
+      size_t typeSize = 0;
+      mlir::Type elementType = fir::unwrapSeqOrBoxedSeqType(v.getType());
+      if (fir::isa_trivial(elementType)) {
+        auto [eleSize, eleAlign] = fir::getTypeSizeAndAlignmentOrCrash(
+            v.getLoc(), elementType, dl, kindMap);
+        typeSize = llvm::alignTo(eleSize, eleAlign);
+      }
+      if (typeSize)
+        return {rank, typeSize};
+    }
+  }
+
+  LLVM_DEBUG(llvm::dbgs() << "Unsupported rank/type: " << v << '\n');
+  return {0, 0};
+}
+
 /// if a value comes from a fir.declare, follow it to the original source,
 /// otherwise return the value
 static mlir::Value unwrapFirDeclare(mlir::Value val) {
@@ -160,12 +194,48 @@ static mlir::Value unwrapFirDeclare(mlir::Value val) {
   return val;
 }
 
+/// Return true if the \p rebox operation keeps the input array
+/// contiguous in the innermost dimension, provided that it is initially
+/// contiguous in the innermost dimension.
+static bool reboxPreservesContinuity(fir::ReboxOp rebox) {
+  // If slicing is not involved, then the rebox does not affect
+  // the contiguity of the array.
+  auto sliceArg = rebox.getSlice();
+  if (!sliceArg)
+    return true;
+
+  // A slice with step=1 in the innermost dimension preserves
+  // the contiguity of the array in the innermost dimension.
+  if (auto sliceOp =
+          mlir::dyn_cast_or_null<fir::SliceOp>(sliceArg.getDefiningOp())) {
+    if (sliceOp.getFields().empty() && sliceOp.getSubstr().empty()) {
+      auto triples = sliceOp.getTriples();
+      if (triples.size() > 2)
+        if (auto innermostStep = fir::getIntIfConstant(triples[2]))
+          if (*innermostStep == 1)
+            return true;
+    }
+
+    LLVM_DEBUG(llvm::dbgs()
+               << "REBOX with slicing may produce non-contiguous array: "
+               << sliceOp << '\n'
+               << rebox << '\n');
+    return false;
+  }
+
+  LLVM_DEBUG(llvm::dbgs() << "REBOX with unknown slice" << sliceArg << '\n'
+                          << rebox << '\n');
+  return false;
+}
+
 /// if a value comes from a fir.rebox, follow the rebox to the original source,
 /// of the value, otherwise return the value
 static mlir::Value unwrapReboxOp(mlir::Value val) {
-  // don't support reboxes of reboxes
-  if (fir::ReboxOp rebox = val.getDefiningOp<fir::ReboxOp>())
+  while (fir::ReboxOp rebox = val.getDefiningOp<fir::ReboxOp>()) {
+    if (!reboxPreservesContinuity(rebox))
+      break;
     val = rebox.getBox();
+  }
   return val;
 }
 
@@ -257,25 +327,10 @@ void LoopVersioningPass::runOnOperation() {
       continue;
     }
 
-    if (auto seqTy = getAsSequenceType(&arg)) {
-      unsigned rank = seqTy.getDimension();
-      if (rank > 0 &&
-          seqTy.getShape()[0] == fir::SequenceType::getUnknownExtent()) {
-        size_t typeSize = 0;
-        mlir::Type elementType = fir::unwrapSeqOrBoxedSeqType(arg.getType());
-        if (mlir::isa<mlir::FloatType>(elementType) ||
-            mlir::isa<mlir::IntegerType>(elementType) ||
-            mlir::isa<mlir::ComplexType>(elementType)) {
-          auto [eleSize, eleAlign] = fir::getTypeSizeAndAlignmentOrCrash(
-              arg.getLoc(), elementType, *dl, kindMap);
-          typeSize = llvm::alignTo(eleSize, eleAlign);
-        }
-        if (typeSize)
-          argsOfInterest.push_back({arg, typeSize, rank, {}});
-        else
-          LLVM_DEBUG(llvm::dbgs() << "Type not supported\n");
-      }
-    }
+    auto [rank, typeSize] =
+        getRankAndElementSize(kindMap, *dl, arg, /*isArgument=*/true);
+    if (rank != 0 && typeSize != 0)
+      argsOfInterest.push_back({arg, typeSize, rank, {}});
   }
 
   if (argsOfInterest.empty()) {
@@ -326,6 +381,13 @@ void LoopVersioningPass::runOnOperation() {
             if (arrayCoor.getSlice())
               argsInLoop.cannotTransform.insert(a.arg);
 
+          // We need to compute the rank and element size
+          // based on the operand, not the original argument,
+          // because array slicing may affect it.
+          std::tie(a.rank, a.size) = getRankAndElementSize(kindMap, *dl, a.arg);
+          if (a.rank == 0 || a.size == 0)
+            argsInLoop.cannotTransform.insert(a.arg);
+
           if (argsInLoop.cannotTransform.contains(a.arg)) {
             // Remove any previously recorded usage, if any.
             argsInLoop.usageInfo.erase(a.arg);
@@ -416,8 +478,8 @@ void LoopVersioningPass::runOnOperation() {
   mlir::Location loc = builder.getUnknownLoc();
   mlir::IndexType idxTy = builder.getIndexType();
 
-  LLVM_DEBUG(llvm::dbgs() << "Module Before transformation:");
-  LLVM_DEBUG(module->dump());
+  LLVM_DEBUG(llvm::dbgs() << "Func Before transformation:\n");
+  LLVM_DEBUG(func->dump());
 
   LLVM_DEBUG(llvm::dbgs() << "loopsOfInterest: " << loopsOfInterest.size()
                           << "\n");
@@ -551,8 +613,8 @@ void LoopVersioningPass::runOnOperation() {
     }
   }
 
-  LLVM_DEBUG(llvm::dbgs() << "After transform:\n");
-  LLVM_DEBUG(module->dump());
+  LLVM_DEBUG(llvm::dbgs() << "Func After transform:\n");
+  LLVM_DEBUG(func->dump());
 
   LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n");
 }
diff --git a/flang/test/Transforms/loop-versioning.fir b/flang/test/Transforms/loop-versioning.fir
index 7528d14b3670d5..2f7c439ed3f4e1 100644
--- a/flang/test/Transforms/loop-versioning.fir
+++ b/flang/test/Transforms/loop-versioning.fir
@@ -113,8 +113,10 @@ func.func @sum1dfixed(%arg0: !fir.ref<!fir.array<?xf64>> {fir.bindc_name = "a"},
 // CHECK-LABEL: func.func @sum1dfixed(
 // CHECK-SAME:                        %[[ARG0:.*]]: !fir.ref<!fir.array<?xf64>> {{.*}})
 // CHECK: fir.do_loop {{.*}}
+// CHECK-NOT: fir.do_loop
 // CHECK: %[[COORD:.*]] = fir.coordinate_of %[[ARG0]], {{.*}}
 // CHECK: %{{.*}} = fir.load %[[COORD]]
+// CHECK-NOT: fir.do_loop
 
 // -----
 
@@ -1641,4 +1643,355 @@ func.func @_QPtest_complex10(%arg0: !fir.box<!fir.array<?x?xcomplex<f80>>> {fir.
 // CHECK:           } else {
 // CHECK:             fir.do_loop
 
+// Test that the loop is not versioned with non-contiguous slices:
+//subroutine test_step2_slice(x, y)
+//  real :: x(:,:), y(:,:)
+//  do i=1,10
+//     x(::2,i) = y(::2,i) + 1.0
+//  end do
+//end subroutine
+func.func @_QPtest_step2_slice(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"}, %arg1: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "y"}) {
+  %c10 = arith.constant 10 : index
+  %cst = arith.constant 1.000000e+00 : f32
+  %c2 = arith.constant 2 : index
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_step2_sliceEi"}
+  %2 = fir.declare %1 {uniq_name = "_QFtest_step2_sliceEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFtest_step2_sliceEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?x?xf32>>
+  %4 = fir.rebox %3 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>>
+  %5 = fir.declare %arg1 dummy_scope %0 {uniq_name = "_QFtest_step2_sliceEy"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?x?xf32>>
+  %6 = fir.rebox %5 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>>
+  %7 = fir.convert %c1 : (index) -> i32
+  %8:2 = fir.do_loop %arg2 = %c1 to %c10 step %c1 iter_args(%arg3 = %7) -> (index, i32) {
+    fir.store %arg3 to %2 : !fir.ref<i32>
+    %9:3 = fir.box_dims %6, %c0 : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+    %10 = arith.addi %9#1, %c1 : index
+    %11 = arith.divsi %10, %c2 : index
+    %12 = arith.cmpi sgt, %11, %c0 : index
+    %13 = arith.select %12, %11, %c0 : index
+    %14 = fir.load %2 : !fir.ref<i32>
+    %15 = fir.convert %14 : (i32) -> i64
+    %16 = fir.undefined index
+    %17 = fir.slice %c1, %9#1, %c2, %15, %16, %16 : (index, index, index, i64, index, index) -> !fir.slice<2>
+    %18 = fir.rebox %6 [%17] : (!fir.box<!fir.array<?x?xf32>>, !fir.slice<2>) -> !fir.box<!fir.array<?xf32>>
+    %19:3 = fir.box_dims %4, %c0 : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+    %20 = fir.slice %c1, %19#1, %c2, %15, %16, %16 : (index, index, index, i64, index, index) -> !fir.slice<2>
+    %21 = fir.rebox %4 [%20] : (!fir.box<!fir.array<?x?xf32>>, !fir.slice<2>) -> !fir.box<!fir.array<?xf32>>
+    fir.do_loop %arg4 = %c1 to %13 step %c1 unordered {
+      %25 = fir.array_coor %18 %arg4 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+      %26 = fir.load %25 : !fir.ref<f32>
+      %27 = arith.addf %26, %cst fastmath<fast> : f32
+      %28 = fir.array_coor %21 %arg4 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+      fir.store %27 to %28 : !fir.ref<f32>
+    }
+    %22 = arith.addi %arg2, %c1 overflow<nsw> : index
+    %23 = fir.load %2 : !fir.ref<i32>
+    %24 = arith.addi %23, %7 overflow<nsw> : i32
+    fir.result %22, %24 : index, i32
+  }
+  fir.store %8#1 to %2 : !fir.ref<i32>
+  return
+}
+// CHECK-LABEL:   func.func @_QPtest_step2_slice(
+// CHECK-NOT: fir.if
+
+// Test that the loop is versioned with most probably
+// contiguous slices:
+//subroutine test_step1_slice(x, y)
+//  real :: x(:,:), y(:,:)
+//  do i=1,10
+//     x(:,i) = y(:,i) + 1.0
+//  end do
+//end subroutine
+func.func @_QPtest_step1_slice(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"}, %arg1: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "y"}) {
+  %c10 = arith.constant 10 : index
+  %cst = arith.constant 1.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_step1_sliceEi"}
+  %2 = fir.declare %1 {uniq_name = "_QFtest_step1_sliceEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFtest_step1_sliceEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?x?xf32>>
+  %4 = fir.rebox %3 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>>
+  %5 = fir.declare %arg1 dummy_scope %0 {uniq_name = "_QFtest_step1_sliceEy"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?x?xf32>>
+  %6 = fir.rebox %5 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>>
+  %7 = fir.convert %c1 : (index) -> i32
+  %8:2 = fir.do_loop %arg2 = %c1 to %c10 step %c1 iter_args(%arg3 = %7) -> (index, i32) {
+    fir.store %arg3 to %2 : !fir.ref<i32>
+    %9:3 = fir.box_dims %6, %c0 : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+    %10 = arith.cmpi sgt, %9#1, %c0 : index
+    %11 = arith.select %10, %9#1, %c0 : index
+    %12 = fir.load %2 : !fir.ref<i32>
+    %13 = fir.convert %12 : (i32) -> i64
+    %14 = fir.undefined index
+    %15 = fir.slice %c1, %9#1, %c1, %13, %14, %14 : (index, index, index, i64, index, index) -> !fir.slice<2>
+    %16 = fir.rebox %6 [%15] : (!fir.box<!fir.array<?x?xf32>>, !fir.slice<2>) -> !fir.box<!fir.array<?xf32>>
+    %17:3 = fir.box_dims %4, %c0 : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+    %18 = fir.slice %c1, %17#1, %c1, %13, %14, %14 : (index, index, index, i64, index, index) -> !fir.slice<2>
+    %19 = fir.rebox %4 [%18] : (!fir.box<!fir.array<?x?xf32>>, !fir.slice<2>) -> !fir.box<!fir.array<?xf32>>
+    fir.do_loop %arg4 = %c1 to %11 step %c1 unordered {
+      %23 = fir.array_coor %16 %arg4 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+      %24 = fir.load %23 : !fir.ref<f32>
+      %25 = arith.addf %24, %cst fastmath<fast> : f32
+      %26 = fir.array_coor %19 %arg4 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+      fir.store %25 to %26 : !fir.ref<f32>
+    }
+    %20 = arith.addi %arg2, %c1 overflow<nsw> : index
+    %21 = fir.load %2 : !fir.ref<i32>
+    %22 = arith.addi %21, %7 overflow<nsw> : i32
+    fir.result %20, %22 : index, i32
+  }
+  fir.store %8#1 to %2 : !fir.ref<i32>
+  return
+}
+// CHECK-LABEL:   func.func @_QPtest_step1_slice(
+// CHECK:           fir.do_loop
+// CHECK:             fir.if
+// CHECK:               fir.do_loop
+// CHECK:             } else {
+// CHECK:               fir.do_loop
+
+// Test that the loop is versioned with logical arrays:
+//subroutine test_logical_slice(x, y)
+//  logical :: x(:,:), y(:,:)
+//  do i=1,10
+//     x(:,i) = y(:,i) .or. y(i,:)
+//  end do
+//end subroutine
+func.func @_QPtest_logical_slice(%arg0: !fir.box<!fir.array<?x?x!fir.logical<4>>> {fir.bindc_name = "x"}, %arg1: !fir.box<!fir.array<?x?x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+  %c10 = arith.constant 10 : index
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_logical_sliceEi"}
+  %2 = fir.declare %1 {uniq_name = "_QFtest_logical_sliceEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFtest_logical_sliceEx"} : (!fir.box<!fir.array<?x?x!fir.logical<4>>>, !fir.dscope) -> !fir.box<!fir.array<?x?x!fir.logical<4>>>
+  %4 = fir.rebox %3 : (!fir.box<!fir.array<?x?x!fir.logical<4>>>) -> !fir.box<!fir.array<?x?x!fir.logical<4>>>
+  %5 = fir.declare %arg1 dummy_scope %0 {uniq_name = "_QFtest_logical_sliceEy"} : (!fir.box<!fir.array<?x?x!fir.logical<4>>>, !fir.dscope) -> !fir.box<!fir.array<?x?x!fir.logical<4>>>
+  %6 = fir.rebox %5 : (!fir.box<!fir.array<?x?x!fir.logical<4>>>) -> !fir.box<!fir.array<?x?x!fir.logical<4>>>
+  %7 = fir.convert %c1 : (index) -> i32
+  %8:2 = fir.do_loop %arg2 = %c1 to %c10 step %c1 iter_args(%arg3 = %7) -> (index, i32) {
+    fir.store %arg3 to %2 : !fir.ref<i32>
+    %9:3 = fir.box_dims %6, %c0 : (!fir.box<!fir.array<?x?x!fir.logical<4>>>, index) -> (index, index, index)
+    %10 = arith.cmpi sgt, %9#1, %c0 : index
+    %11 = arith.select %10, %9#1, %c0 : index
+    %12 = fir.load %2 : !fir.ref<i32>
+    %13 = fir.convert %12 : (i32) -> i64
+    %14 = fir.undefined index
+    %15 = fir.slice %c1, %9#1, %c1, %13, %14, %14 : (index, index, index, i64, index, index) -> !fir.slice<2>
+    %16 = fir.rebox %6 [%15] : (!fir.box<!fir.array<?x?x!fir.logical<4>>>, !fir.slice<2>) -> !fir.box<!fir.array<?x!fir.logical<4>>>
+    %17:3 = fir.box_dims %6, %c1 : (!fir.box<!fir.array<?x?x!fir.logical<4>>>, index) -> (index, index, index)
+    %18 = fir.slice %13, %14, %14, %c1, %17#1, %c1 : (i64, index, index, index, index, index) -> !fir.slice<2>
+    %19 = fir.rebox %6 [%18] : (!fir.box<!fir.array<?x?x!fir.logical<4>>>, !fir.slice<2>) -> !fir.box<!fir.array<?x!fir.logical<4>>>
+    %20:3 = fir.box_dims %4, %c0 : (!fir.box<!fir.array<?x?x!fir.logical<4>>>, index) -> (index, index, index)
+    %21 = fir.slice %c1, %20#1, %c1, %13, %14, %14 : (index, index, index, i64, index, index) -> !fir.slice<2>
+    %22 = fir.rebox %4 [%21] : (!fir.box<!fir.array<?x?x!fir.logical<4>>>, !fir.slice<2>) -> !fir.box<!fir.array<?x!fir.logical<4>>>
+    fir.do_loop %arg4 = %c1 to %11 step %c1 unordered {
+      %26 = fir.array_coor %16 %arg4 : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+      %27 = fir.array_coor %19 %arg4 : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+      %28 = fir.load %26 : !fir.ref<!fir.logical<4>>
+      %29 = fir.load %27 : !fir.ref<!fir.logical<4>>
+      %30 = fir.convert %28 : (!fir.logical<4>) -> i1
+      %31 = fir.convert %29 : (!fir.logical<4>) -> i1
+      %32 = arith.ori %30, %31 : i1
+      %33 = fir.convert %32 : (i1) -> !fir.logical<4>
+      %34 = fir.array_coor %22 %arg4 : (!fir.box<!fir.array<?x!fir.logical<4>>>, index) -> !fir.ref<!fir.logical<4>>
+      fir.store %33 to %34 : !fir.ref<!fir.logical<4>>
+    }
+    %23 = arith.addi %arg2, %c1 overflow<nsw> : index
+    %24 = fir.load %2 : !fir.ref<i32>
+    %25 = arith.addi %24, %7 overflow<nsw> : i32
+    fir.result %23, %25 : index, i32
+  }
+  fir.store %8#1 to %2 : !fir.ref<i32>
+  return
+}
+// CHECK-LABEL:   func.func @_QPtest_logical_slice(
+// CHECK:           fir.do_loop
+// CHECK:             fir.if
+// CHECK:               fir.do_loop
+// CHECK:             } else {
+// CHECK:               fir.do_loop
+
+// Test that the loop is versioned when most probably
+// contiguous slices have a known shape:
+//subroutine test_known_shape_slice(x, y)
+//  integer :: x(:,:), y(:,:)
+//  do i=1,10
+//     x(1:10,i) = y(1:10,i) + 1
+//  end do
+//end subroutine
+func.func @_QPtest_known_shape_slice(%arg0: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "x"}, %arg1: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "y"}) {
+  %c10 = arith.constant 10 : index
+  %c1 = arith.constant 1 : index
+  %c1_i32 = arith.constant 1 : i32
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_known_shape_sliceEi"}
+  %2 = fir.declare %1 {uniq_name = "_QFtest_known_shape_sliceEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFtest_known_shape_sliceEx"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> !fir.box<!fir.array<?x?xi32>>
+  %4 = fir.rebox %3 : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<!fir.array<?x?xi32>>
+  %5 = fir.declare %arg1 dummy_scope %0 {uniq_name = "_QFtest_known_shape_sliceEy"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> !fir.box<!fir.array<?x?xi32>>
+  %6 = fir.rebox %5 : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<!fir.array<?x?xi32>>
+  %7 = fir.convert %c1 : (index) -> i32
+  %8:2 = fir.do_loop %arg2 = %c1 to %c10 step %c1 iter_args(%arg3 = %7) -> (index, i32) {
+    fir.store %arg3 to %2 : !fir.ref<i32>
+    %9 = fir.load %2 : !fir.ref<i32>
+    %10 = fir.convert %9 : (i32) -> i64
+    %11 = fir.undefined index
+    %12 = fir.slice %c1, %c10, %c1, %10, %11, %11 : (index, index, index, i64, index, index) -> !fir.slice<2>
+    %13 = fir.rebox %6 [%12] : (!fir.box<!fir.array<?x?xi32>>, !fir.slice<2>) -> !fir.box<!fir.array<10xi32>>
+    %14 = fir.rebox %4 [%12] : (!fir.box<!fir.array<?x?xi32>>, !fir.slice<2>) -> !fir.box<!fir.array<10xi32>>
+    fir.do_loop %arg4 = %c1 to %c10 step %c1 unordered {
+      %18 = fir.array_coor %13 %arg4 : (!fir.box<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+      %19 = fir.load %18 : !fir.ref<i32>
+      %20 = arith.addi %19, %c1_i32 : i32
+      %21 = fir.array_coor %14 %arg4 : (!fir.box<!fir.array<10xi32>>, index) -> !fir.ref<i32>
+      fir.store %20 to %21 : !fir.ref<i32>
+    }
+    %15 = arith.addi %arg2, %c1 overflow<nsw> : index
+    %16 = fir.load %2 : !fir.ref<i32>
+    %17 = arith.addi %16, %7 overflow<nsw> : i32
+    fir.result %15, %17 : index, i32
+  }
+  fir.store %8#1 to %2 : !fir.ref<i32>
+  return
+}
+// CHECK-LABEL:   func.func @_QPtest_known_shape_slice(
+// CHECK:           fir.do_loop
+// CHECK:             fir.if
+// CHECK:               fir.do_loop
+// CHECK:             } else {
+// CHECK:               fir.do_loop
+
+// Test that the loop is not versioned for most probably
+// non-contiguous slices:
+//subroutine test_maybe_noncontig_slice(x, y)
+//  real :: x(:,:), y(:,:)
+//  do i=1,10
+//     x(i,:) = y(i,:) + 1.0
+//  end do
+//end subroutine
+func.func @_QPtest_maybe_noncontig_slice(%arg0: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "x"}, %arg1: !fir.box<!fir.array<?x?xf32>> {fir.bindc_name = "y"}) {
+  %c10 = arith.constant 10 : index
+  %cst = arith.constant 1.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_maybe_noncontig_sliceEi"}
+  %2 = fir.declare %1 {uniq_name = "_QFtest_maybe_noncontig_sliceEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFtest_maybe_noncontig_sliceEx"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?x?xf32>>
+  %4 = fir.rebox %3 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>>
+  %5 = fir.declare %arg1 dummy_scope %0 {uniq_name = "_QFtest_maybe_noncontig_sliceEy"} : (!fir.box<!fir.array<?x?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?x?xf32>>
+  %6 = fir.rebox %5 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>>
+  %7 = fir.convert %c1 : (index) -> i32
+  %8:2 = fir.do_loop %arg2 = %c1 to %c10 step %c1 iter_args(%arg3 = %7) -> (index, i32) {
+    fir.store %arg3 to %2 : !fir.ref<i32>
+    %9 = fir.load %2 : !fir.ref<i32>
+    %10 = fir.convert %9 : (i32) -> i64
+    %11:3 = fir.box_dims %6, %c1 : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+    %12 = arith.cmpi sgt, %11#1, %c0 : index
+    %13 = arith.select %12, %11#1, %c0 : index
+    %14 = fir.undefined index
+    %15 = fir.slice %10, %14, %14, %c1, %11#1, %c1 : (i64, index, index, index, index, index) -> !fir.slice<2>
+    %16 = fir.rebox %6 [%15] : (!fir.box<!fir.array<?x?xf32>>, !fir.slice<2>) -> !fir.box<!fir.array<?xf32>>
+    %17:3 = fir.box_dims %4, %c1 : (!fir.box<!fir.array<?x?xf32>>, index) -> (index, index, index)
+    %18 = fir.slice %10, %14, %14, %c1, %17#1, %c1 : (i64, index, index, index, index, index) -> !fir.slice<2>
+    %19 = fir.rebox %4 [%18] : (!fir.box<!fir.array<?x?xf32>>, !fir.slice<2>) -> !fir.box<!fir.array<?xf32>>
+    fir.do_loop %arg4 = %c1 to %13 step %c1 unordered {
+      %23 = fir.array_coor %16 %arg4 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+      %24 = fir.load %23 : !fir.ref<f32>
+      %25 = arith.addf %24, %cst fastmath<fast> : f32
+      %26 = fir.array_coor %19 %arg4 : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+      fir.store %25 to %26 : !fir.ref<f32>
+    }
+    %20 = arith.addi %arg2, %c1 overflow<nsw> : index
+    %21 = fir.load %2 : !fir.ref<i32>
+    %22 = arith.addi %21, %7 overflow<nsw> : i32
+    fir.result %20, %22 : index, i32
+  }
+  fir.store %8#1 to %2 : !fir.ref<i32>
+  return
+}
+// CHECK-LABEL:   func.func @_QPtest_maybe_noncontig_slice(
+// CHECK-NOT: fir.if
+
+// Regression test for facerec's GraphSimFct:
+//real function test_graphsimfct(a1, a2)
+//  integer :: i
+//  real, intent(in) :: a1(:,:,:)
+//  real, intent(in) :: a2(:,:,:,:)
+//  graphsimfct = 0.0
+//  do i=1,10
+//     test_graphsimfct = test_graphsimfct + SUM(a1(:,:,i) * a2(:,:,i,i))
+//  end do
+//end function
+func.func @_QPtest_graphsimfct(%arg0: !fir.box<!fir.array<?x?x?xf32>> {fir.bindc_name = "a1"}, %arg1: !fir.box<!fir.array<?x?x?x?xf32>> {fir.bindc_name = "a2"}) -> f32 {
+  %c10 = arith.constant 10 : index
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_graphsimfctEa1"} : (!fir.box<!fir.array<?x?x?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?x?x?xf32>>
+  %2 = fir.rebox %1 : (!fir.box<!fir.array<?x?x?xf32>>) -> !fir.box<!fir.array<?x?x?xf32>>
+  %3 = fir.declare %arg1 dummy_scope %0 {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QFtest_graphsimfctEa2"} : (!fir.box<!fir.array<?x?x?x?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?x?x?x?xf32>>
+  %4 = fir.rebox %3 : (!fir.box<!fir.array<?x?x?x?xf32>>) -> !fir.box<!fir.array<?x?x?x?xf32>>
+  %5 = fir.alloca f32 {bindc_name = "graphsimfct", uniq_name = "_QFtest_graphsimfctEgraphsimfct"}
+  %6 = fir.declare %5 {uniq_name = "_QFtest_graphsimfctEgraphsimfct"} : (!fir.ref<f32>) -> !fir.ref<f32>
+  %7 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_graphsimfctEi"}
+  %8 = fir.declare %7 {uniq_name = "_QFtest_graphsimfctEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %9 = fir.alloca f32 {bindc_name = "test_graphsimfct", uniq_name = "_QFtest_graphsimfctEtest_graphsimfct"}
+  %10 = fir.declare %9 {uniq_name = "_QFtest_graphsimfctEtest_graphsimfct"} : (!fir.ref<f32>) -> !fir.ref<f32>
+  fir.store %cst to %6 : !fir.ref<f32>
+  %11 = fir.convert %c1 : (index) -> i32
+  %12:2 = fir.do_loop %arg2 = %c1 to %c10 step %c1 iter_args(%arg3 = %11) -> (index, i32) {
+    fir.store %arg3 to %8 : !fir.ref<i32>
+    %14 = fir.load %10 : !fir.ref<f32>
+    %15:3 = fir.box_dims %2, %c0 : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index)
+    %16:3 = fir.box_dims %2, %c1 : (!fir.box<!fir.array<?x?x?xf32>>, index) -> (index, index, index)
+    %17 = arith.cmpi sgt, %15#1, %c0 : index
+    %18 = arith.select %17, %15#1, %c0 : index
+    %19 = arith.cmpi sgt, %16#1, %c0 : index
+    %20 = arith.select %19, %16#1, %c0 : index
+    %21 = fir.load %8 : !fir.ref<i32>
+    %22 = fir.convert %21 : (i32) -> i64
+    %23 = fir.undefined index
+    %24 = fir.slice %c1, %15#1, %c1, %c1, %16#1, %c1, %22, %23, %23 : (index, index, index, index, index, index, i64, index, index) -> !fir.slice<3>
+    %25 = fir.rebox %2 [%24] : (!fir.box<!fir.array<?x?x?xf32>>, !fir.slice<3>) -> !fir.box<!fir.array<?x?xf32>>
+    %26:3 = fir.box_dims %4, %c0 : (!fir.box<!fir.array<?x?x?x?xf32>>, index) -> (index, index, index)
+    %27:3 = fir.box_dims %4, %c1 : (!fir.box<!fir.array<?x?x?x?xf32>>, index) -> (index, index, index)
+    %28 = fir.slice %c1, %26#1, %c1, %c1, %27#1, %c1, %22, %23, %23, %22, %23, %23 : (index, index, index, index, index, index, i64, index, index, i64, index, index) -> !fir.slice<4>
+    %29 = fir.rebox %4 [%28] : (!fir.box<!fir.array<?x?x?x?xf32>>, !fir.slice<4>) -> !fir.box<!fir.array<?x?xf32>>
+    %30 = fir.do_loop %arg4 = %c1 to %20 step %c1 unordered iter_args(%arg5 = %cst) -> (f32) {
+      %35 = fir.do_loop %arg6 = %c1 to %18 step %c1 unordered iter_args(%arg7 = %arg5) -> (f32) {
+        %36 = fir.array_coor %25 %arg6, %arg4 : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
+        %37 = fir.array_coor %29 %arg6, %arg4 : (!fir.box<!fir.array<?x?xf32>>, index, index) -> !fir.ref<f32>
+        %38 = fir.load %36 : !fir.ref<f32>
+        %39 = fir.load %37 : !fir.ref<f32>
+        %40 = arith.mulf %38, %39 fastmath<fast> : f32
+        %41 = arith.addf %arg7, %40 fastmath<fast> : f32
+        fir.result %41 : f32
+      }
+      fir.result %35 : f32
+    }
+    %31 = arith.addf %14, %30 fastmath<fast> : f32
+    fir.store %31 to %10 : !fir.ref<f32>
+    %32 = arith.addi %arg2, %c1 overflow<nsw> : index
+    %33 = fir.load %8 : !fir.ref<i32>
+    %34 = arith.addi %33, %11 overflow<nsw> : i32
+    fir.result %32, %34 : index, i32
+  }
+  fir.store %12#1 to %8 : !fir.ref<i32>
+  %13 = fir.load %10 : !fir.ref<f32>
+  return %13 : f32
+}
+// CHECK-LABEL:   func.func @_QPtest_graphsimfct(
+// CHECK:           fir.do_loop
+// CHECK:             fir.do_loop
+// CHECK:               fir.if
+// CHECK:                 fir.do_loop
+// CHECK:               } else {
+// CHECK:                 fir.do_loop
+
 } // End module