[flang-commits] [flang] af7edf1 - [flang] Keep original data type for do-variable value.

Tue Aug 23 15:56:52 PDT 2022

Author: Slava Zakharin
Date: 2022-08-23T15:54:54-07:00
New Revision: af7edf1557d87026bb4dd4783f60e766538e923c

URL: https://github.com/llvm/llvm-project/commit/af7edf1557d87026bb4dd4783f60e766538e923c
DIFF: https://github.com/llvm/llvm-project/commit/af7edf1557d87026bb4dd4783f60e766538e923c.diff

LOG: [flang] Keep original data type for do-variable value.

Keep the original data type of integer do-variables
for structured loops. When do-variable's data type
is an integer type shorter than IndexType, processing
the do-variable separately from the DoLoop's iteration index
allows getting rid of type casts, which can make backend
optimizations easier.

For example,
```
  do i = 2, n-1
    do j = 2, n-1
      ... = a(j-1, i)
    end do
  end do
```

If the value of 'j' is computed by casting the DoLoop's iteration
index to 'i32', then Flang will produce the following LLVM IR:
```
  %1 = trunc i64 %iter_index to i32
  %2 = sub i32 %1, 1
  %3 = sext i32 %2 to i64
```

LLVM's InstCombine may try to get rid of the sign extension,
and may transform this into:
```
  %1 = shl i64 %iter_index, 32
  %2 = add i64 %1, -4294967296
  %3 = ashr exact i64 %2, 32
```

The extra computations for the element address applied on top
of this awkward pattern confuse the LLVM vectorizer so that
it does not recognize the unit-strided access of 'a'.

Measured performance improvements on `SPEC CPU2000 on IceLake`:
```
168.wupwise:    11.96%
171.swim:       11.22%
172.mgrid:      56.38%
178.galgel:      7.29%
301.apsi:        8.32%
```

Differential Revision: https://reviews.llvm.org/D132176

Added: 
    

Modified: 
    flang/lib/Lower/Bridge.cpp
    flang/test/Lower/OpenMP/omp-parallel-private-clause-fixes.f90
    flang/test/Lower/OpenMP/omp-wsloop-variable.f90
    flang/test/Lower/array-expression-slice-1.f90
    flang/test/Lower/do_loop.f90
    flang/test/Lower/do_loop_unstructured.f90
    flang/test/Lower/infinite_loop.f90
    flang/test/Lower/loops.f90
    flang/test/Lower/loops2.f90
    flang/test/Lower/mixed_loops.f90

Removed: 
    


################################################################################
diff  --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 36348f90f247c..dd133f9d68fa6 100644

--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -77,6 +77,29 @@ struct IncrementLoopInfo {
 
   bool isStructured() const { return !headerBlock; }
 
+  /// \return true if for this do loop its do-variable's value
+  /// is represented as the block argument of the do loop's
+  /// region. In this case the data type of the block argument
+  /// matches the original data type of the do-variable as written
+  /// in user code, and the value is adjusted using the step value
+  /// on each iteration of the do loop.
+  ///
+  /// When do-variable's data type is an integer type shorter
+  /// than IndexType, processing the do-variable separately
+  /// from the do loop's iteration index allows getting rid
+  /// of type casts, which can make backend optimizations easier.
+  /// In particular, computing the do variable value from
+  /// the iteration index may introduce chains like trunc->arith->sext,
+  /// which may be optimized into sequences of shift operations
+  /// in InstCombine, which then prevents vectorizer from recognizing
+  /// unit-strided accesses.
+  ///
+  /// We could have disabled the extra iteration variable usage
+  /// for cases when its data type is not shorter than IndexType,
+  /// but this requires having proper DataLayout set up for the enclosing
+  /// module. This is currently blocked by llvm-project#57230 issue.
+  bool doVarIsALoopArg() const { return isStructured() && !isUnordered; }
+
   mlir::Type getLoopVariableType() const {
     assert(loopVariable && "must be set");
     return fir::unwrapRefType(loopVariable.getType());
@@ -96,6 +119,11 @@ struct IncrementLoopInfo {
 
   // Data members for structured loops.
   fir::DoLoopOp doLoop = nullptr;
+  // Do loop block argument holding the current value
+  // of the do-variable. It has the same data type as the original
+  // do-variable. It is non-null after genFIRIncrementLoopBegin()
+  // iff doVarIsALoopArg() returns true.
+  mlir::Value doVarValue = nullptr;
 
   // Data members for unstructured loops.
   bool hasRealControl = false;
@@ -166,7 +194,7 @@ class RuntimeTypeInfoConverter {
   llvm::SmallSetVector<Fortran::semantics::SymbolRef, 64> seen;
 };
 
-using IncrementLoopNestInfo = llvm::SmallVector<IncrementLoopInfo>;
+using IncrementLoopNestInfo = llvm::SmallVector<IncrementLoopInfo, 8>;
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -1227,13 +1255,28 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
       // Structured loop - generate fir.do_loop.
       if (info.isStructured()) {
+        mlir::Value doVarInit = nullptr;
+        if (info.doVarIsALoopArg())
+          doVarInit = builder->createConvert(loc, info.getLoopVariableType(),
+                                             lowerValue);
+
         info.doLoop = builder->create<fir::DoLoopOp>(
             loc, lowerValue, upperValue, info.stepValue, info.isUnordered,
-            /*finalCountValue=*/!info.isUnordered);
+            /*finalCountValue=*/!info.isUnordered,
+            doVarInit ? mlir::ValueRange{doVarInit} : mlir::ValueRange{});
         builder->setInsertionPointToStart(info.doLoop.getBody());
-        // Update the loop variable value, as it may have non-index references.
-        mlir::Value value = builder->createConvert(
-            loc, info.getLoopVariableType(), info.doLoop.getInductionVar());
+        mlir::Value value;
+        if (!doVarInit) {
+          // Update the loop variable value, as it may have non-index
+          // references.
+          value = builder->createConvert(loc, info.getLoopVariableType(),
+                                         info.doLoop.getInductionVar());
+        } else {
+          // The loop variable value is the region's argument rather
+          // than the DoLoop's index value.
+          value = info.doLoop.getRegionIterArgs()[0];
+          info.doVarValue = value;
+        }
         builder->create<fir::StoreOp>(loc, value, info.loopVariable);
         if (info.maskExpr) {
           Fortran::lower::StatementContext stmtCtx;
@@ -1324,16 +1367,35 @@ class FirConverter : public Fortran::lower::AbstractConverter {
         // End fir.do_loop.
         if (!info.isUnordered) {
           builder->setInsertionPointToEnd(info.doLoop.getBody());
-          mlir::Value result = builder->create<mlir::arith::AddIOp>(
-              loc, info.doLoop.getInductionVar(), info.doLoop.getStep());
-          builder->create<fir::ResultOp>(loc, result);
+          llvm::SmallVector<mlir::Value, 2> results;
+          results.push_back(builder->create<mlir::arith::AddIOp>(
+              loc, info.doLoop.getInductionVar(), info.doLoop.getStep()));
+          if (info.doVarIsALoopArg()) {
+            // If we use an extra iteration variable of the same data
+            // type as the original do-variable, we have to increment
+            // it by the step value. Note that the step has 'index'
+            // type, so we need to cast it, first.
+            mlir::Value stepCast = builder->createConvert(
+                loc, info.getLoopVariableType(), info.doLoop.getStep());
+            results.push_back(builder->create<mlir::arith::AddIOp>(
+                loc, info.doVarValue, stepCast));
+          }
+          builder->create<fir::ResultOp>(loc, results);
         }
         builder->setInsertionPointAfter(info.doLoop);
         if (info.isUnordered)
           continue;
         // The loop control variable may be used after loop execution.
-        mlir::Value lcv = builder->createConvert(
-            loc, info.getLoopVariableType(), info.doLoop.getResult(0));
+        mlir::Value lcv = nullptr;
+        if (info.doVarIsALoopArg()) {
+          // Final do-variable value is the second result of the DoLoop.
+          assert(info.doLoop.getResults().size() == 2 &&
+                 "invalid do-variable handling");
+          lcv = info.doLoop.getResult(1);
+        } else {
+          lcv = builder->createConvert(loc, info.getLoopVariableType(),
+                                       info.doLoop.getResult(0));
+        }
         builder->create<fir::StoreOp>(loc, lcv, info.loopVariable);
         continue;
       }

diff  --git a/flang/test/Lower/OpenMP/omp-parallel-private-clause-fixes.f90 b/flang/test/Lower/OpenMP/omp-parallel-private-clause-fixes.f90
index d53543455f3b3..0d91c06a33921 100644
--- a/flang/test/Lower/OpenMP/omp-parallel-private-clause-fixes.f90
+++ b/flang/test/Lower/OpenMP/omp-parallel-private-clause-fixes.f90
@@ -20,18 +20,21 @@
 ! CHECK:             %[[VAL_9:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
 ! CHECK:             %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index
 ! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : index
-! CHECK:             %[[VAL_12:.*]] = fir.do_loop %[[VAL_13:.*]] = %[[VAL_8]] to %[[VAL_10]] step %[[VAL_11]] -> index {
-! CHECK:               %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (index) -> i32
-! CHECK:               fir.store %[[VAL_14]] to %[[PRIV_J]] : !fir.ref<i32>
+! CHECK:             %[[LB:.*]] = fir.convert %[[VAL_8]] : (index) -> i32
+! CHECK:             %[[VAL_12:.*]]:2 = fir.do_loop %[[VAL_13:[^ ]*]] =
+! CHECK-SAME:            %[[VAL_8]] to %[[VAL_10]] step %[[VAL_11]]
+! CHECK-SAME:            iter_args(%[[IV:.*]] = %[[LB]]) -> (index, i32) {
+! CHECK:               fir.store %[[IV]] to %[[PRIV_J]] : !fir.ref<i32>
 ! CHECK:               %[[LOAD:.*]] = fir.load %[[PRIV_I]] : !fir.ref<i32>
 ! CHECK:               %[[VAL_15:.*]] = fir.load %[[PRIV_J]] : !fir.ref<i32>
 ! CHECK:               %[[VAL_16:.*]] = arith.addi %[[LOAD]], %[[VAL_15]] : i32
 ! CHECK:               fir.store %[[VAL_16]] to %[[PRIV_X]] : !fir.ref<i32>
 ! CHECK:               %[[VAL_17:.*]] = arith.addi %[[VAL_13]], %[[VAL_11]] : index
-! CHECK:               fir.result %[[VAL_17]] : index
+! CHECK:               %[[STEPCAST:.*]] = fir.convert %[[VAL_11]] : (index) -> i32
+! CHECK:               %[[IVINC:.*]] = arith.addi %[[IV]], %[[STEPCAST]]
+! CHECK:               fir.result %[[VAL_17]], %[[IVINC]] : index, i32
 ! CHECK:             }
-! CHECK:             %[[VAL_18:.*]] = fir.convert %[[VAL_19:.*]] : (index) -> i32
-! CHECK:             fir.store %[[VAL_18]] to %[[PRIV_J]] : !fir.ref<i32>
+! CHECK:             fir.store %[[VAL_12]]#1 to %[[PRIV_J]] : !fir.ref<i32>
 ! CHECK:             omp.yield
 ! CHECK:           }
 ! CHECK:           omp.terminator

diff  --git a/flang/test/Lower/OpenMP/omp-wsloop-variable.f90 b/flang/test/Lower/OpenMP/omp-wsloop-variable.f90
index 9e4ce30a2c5ef..10a7dded9209f 100644
--- a/flang/test/Lower/OpenMP/omp-wsloop-variable.f90
+++ b/flang/test/Lower/OpenMP/omp-wsloop-variable.f90
@@ -99,9 +99,11 @@ end program wsloop_variable
 !CHECK:           %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> index
 !CHECK:           %[[VAL_17:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
 !CHECK:           %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (i32) -> index
-!CHECK:           %[[VAL_19:.*]] = fir.do_loop %[[VAL_20:.*]] = %[[VAL_14]] to %[[VAL_16]] step %[[VAL_18]] -> index {
-!CHECK:             %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (index) -> i64
-!CHECK:             fir.store %[[VAL_21]] to %[[VAL_5]] : !fir.ref<i64>
+!CHECK:           %[[LB:.*]] = fir.convert %[[VAL_14]] : (index) -> i64
+!CHECK:           %[[VAL_19:.*]]:2 = fir.do_loop %[[VAL_20:[^ ]*]] =
+!CHECK-SAME:          %[[VAL_14]] to %[[VAL_16]] step %[[VAL_18]]
+!CHECK-SAME:          iter_args(%[[IV:.*]] = %[[LB]]) -> (index, i64) {
+!CHECK:             fir.store %[[IV]] to %[[VAL_5]] : !fir.ref<i64>
 !CHECK:             %[[LOAD_IV:.*]] = fir.load %[[STORE_IV]] : !fir.ref<i32>
 !CHECK:             %[[VAL_22:.*]] = fir.convert %[[LOAD_IV]] : (i32) -> i64
 !CHECK:             %[[VAL_23:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64>
@@ -109,10 +111,11 @@ end program wsloop_variable
 !CHECK:             %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i64) -> f32
 !CHECK:             fir.store %[[VAL_25]] to %[[VAL_6]] : !fir.ref<f32>
 !CHECK:             %[[VAL_26:.*]] = arith.addi %[[VAL_20]], %[[VAL_18]] : index
-!CHECK:             fir.result %[[VAL_26]] : index
+!CHECK:             %[[STEPCAST:.*]] = fir.convert %[[VAL_18]] : (index) -> i64
+!CHECK:             %[[IVINC:.*]] = arith.addi %[[IV]], %[[STEPCAST]]
+!CHECK:             fir.result %[[VAL_26]], %[[IVINC]] : index, i64
 !CHECK:           }
-!CHECK:           %[[VAL_27:.*]] = fir.convert %[[VAL_28:.*]] : (index) -> i64
-!CHECK:           fir.store %[[VAL_27]] to %[[VAL_5]] : !fir.ref<i64>
+!CHECK:           fir.store %[[VAL_19]]#1 to %[[VAL_5]] : !fir.ref<i64>
 !CHECK:           omp.yield
 !CHECK:         }
 !CHECK:         return

diff  --git a/flang/test/Lower/array-expression-slice-1.f90 b/flang/test/Lower/array-expression-slice-1.f90
index b831234dc0644..9ce28f033cf92 100644
--- a/flang/test/Lower/array-expression-slice-1.f90
+++ b/flang/test/Lower/array-expression-slice-1.f90
@@ -25,20 +25,19 @@
 ! CHECK-DAG:         %[[VAL_30:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFEj"}
 ! CHECK-DAG:         %[[VAL_31:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFEk"}
 ! CHECK:         fir.store %[[VAL_24]] to %[[VAL_31]] : !fir.ref<i32>
-! CHECK:         br ^bb1(%[[VAL_5]], %[[VAL_0]] : index, index)
-! CHECK:       ^bb1(%[[VAL_32:.*]]: index, %[[VAL_33:.*]]: index):
+! CHECK:         %[[LB:.*]] = fir.convert %[[VAL_5]] : (index) -> i32
+! CHECK:         br ^bb1(%[[LB]], %[[VAL_0]] : i32, index)
+! CHECK:       ^bb1(%[[VAL_32:.*]]: i32, %[[VAL_33:.*]]: index):
 ! CHECK:         %[[VAL_34:.*]] = arith.cmpi sgt, %[[VAL_33]], %[[VAL_6]] : index
 ! CHECK:         cond_br %[[VAL_34]], ^bb2, ^bb6
 ! CHECK:       ^bb2:
-! CHECK:         %[[VAL_35:.*]] = fir.convert %[[VAL_32]] : (index) -> i32
-! CHECK:         fir.store %[[VAL_35]] to %[[VAL_30]] : !fir.ref<i32>
-! CHECK:         br ^bb3(%[[VAL_5]], %[[VAL_0]] : index, index)
-! CHECK:       ^bb3(%[[VAL_36:.*]]: index, %[[VAL_37:.*]]: index):
+! CHECK:         fir.store %[[VAL_32]] to %[[VAL_30]] : !fir.ref<i32>
+! CHECK:         br ^bb3(%[[LB]], %[[VAL_0]] : i32, index)
+! CHECK:       ^bb3(%[[VAL_36:.*]]: i32, %[[VAL_37:.*]]: index):
 ! CHECK:         %[[VAL_38:.*]] = arith.cmpi sgt, %[[VAL_37]], %[[VAL_6]] : index
 ! CHECK:         cond_br %[[VAL_38]], ^bb4, ^bb5
 ! CHECK:       ^bb4:
-! CHECK:         %[[VAL_39:.*]] = fir.convert %[[VAL_36]] : (index) -> i32
-! CHECK:         fir.store %[[VAL_39]] to %[[VAL_28]] : !fir.ref<i32>
+! CHECK:         fir.store %[[VAL_36]] to %[[VAL_28]] : !fir.ref<i32>
 ! CHECK:         %[[VAL_40:.*]] = fir.load %[[VAL_31]] : !fir.ref<i32>
 ! CHECK:         %[[VAL_41:.*]] = arith.addi %[[VAL_40]], %[[VAL_23]] : i32
 ! CHECK:         fir.store %[[VAL_41]] to %[[VAL_31]] : !fir.ref<i32>
@@ -53,12 +52,11 @@
 ! CHECK:         %[[VAL_50:.*]] = arith.subi %[[VAL_49]], %[[VAL_20]] : i64
 ! CHECK:         %[[VAL_51:.*]] = fir.coordinate_of %[[VAL_25]], %[[VAL_47]], %[[VAL_50]] : (!fir.ref<!fir.array<10x10xf32>>, i64, i64) -> !fir.ref<f32>
 ! CHECK:         fir.store %[[VAL_44]] to %[[VAL_51]] : !fir.ref<f32>
-! CHECK:         %[[VAL_52:.*]] = arith.addi %[[VAL_36]], %[[VAL_5]] : index
+! CHECK:         %[[VAL_52:.*]] = arith.addi %[[VAL_36]], %[[LB]] : i32
 ! CHECK:         %[[VAL_53:.*]] = arith.subi %[[VAL_37]], %[[VAL_5]] : index
-! CHECK:         br ^bb3(%[[VAL_52]], %[[VAL_53]] : index, index)
+! CHECK:         br ^bb3(%[[VAL_52]], %[[VAL_53]] : i32, index)
 ! CHECK:       ^bb5:
-! CHECK:         %[[VAL_54:.*]] = fir.convert %[[VAL_36]] : (index) -> i32
-! CHECK:         fir.store %[[VAL_54]] to %[[VAL_28]] : !fir.ref<i32>
+! CHECK:         fir.store %[[VAL_36]] to %[[VAL_28]] : !fir.ref<i32>
 ! CHECK:         %[[VAL_55:.*]] = fir.load %[[VAL_31]] : !fir.ref<i32>
 ! CHECK:         %[[VAL_56:.*]] = fir.convert %[[VAL_55]] : (i32) -> f32
 ! CHECK:         %[[VAL_57:.*]] = fir.call @fir.sin.f32.f32(%[[VAL_56]]) : (f32) -> f32
@@ -67,12 +65,11 @@
 ! CHECK:         %[[VAL_60:.*]] = arith.subi %[[VAL_59]], %[[VAL_20]] : i64
 ! CHECK:         %[[VAL_61:.*]] = fir.coordinate_of %[[VAL_27]], %[[VAL_60]] : (!fir.ref<!fir.array<10xf32>>, i64) -> !fir.ref<f32>
 ! CHECK:         fir.store %[[VAL_57]] to %[[VAL_61]] : !fir.ref<f32>
-! CHECK:         %[[VAL_62:.*]] = arith.addi %[[VAL_32]], %[[VAL_5]] : index
+! CHECK:         %[[VAL_62:.*]] = arith.addi %[[VAL_32]], %[[LB]] : i32
 ! CHECK:         %[[VAL_63:.*]] = arith.subi %[[VAL_33]], %[[VAL_5]] : index
-! CHECK:         br ^bb1(%[[VAL_62]], %[[VAL_63]] : index, index)
+! CHECK:         br ^bb1(%[[VAL_62]], %[[VAL_63]] : i32, index)
 ! CHECK:       ^bb6:
-! CHECK:         %[[VAL_64:.*]] = fir.convert %[[VAL_32]] : (index) -> i32
-! CHECK:         fir.store %[[VAL_64]] to %[[VAL_30]] : !fir.ref<i32>
+! CHECK:         fir.store %[[VAL_32]] to %[[VAL_30]] : !fir.ref<i32>
 ! CHECK:         %[[VAL_65:.*]] = fir.shape %[[VAL_11]] : (index) -> !fir.shape<1>
 ! CHECK:         %[[VAL_66:.*]] = fir.undefined index
 ! CHECK:         %[[VAL_67:.*]] = fir.shape %[[VAL_0]], %[[VAL_0]] : (index, index) -> !fir.shape<2>

diff  --git a/flang/test/Lower/do_loop.f90 b/flang/test/Lower/do_loop.f90
index 61190ec14472b..de21f0b174148 100644
--- a/flang/test/Lower/do_loop.f90
+++ b/flang/test/Lower/do_loop.f90
@@ -16,16 +16,19 @@ subroutine simple_loop
   ! CHECK: %[[C5:.*]] = arith.constant 5 : i32
   ! CHECK: %[[C5_CVT:.*]] = fir.convert %c5_i32 : (i32) -> index
   ! CHECK: %[[C1:.*]] = arith.constant 1 : index
-  ! CHECK: %[[LI_RES:.*]] = fir.do_loop %[[LI:.*]] = %[[C1_CVT]] to %[[C5_CVT]] step %[[C1]] -> index {
+  ! CHECK: %[[LB:.*]] = fir.convert %[[C1_CVT]] : (index) -> i32
+  ! CHECK: %[[LI_RES:.*]]:2 = fir.do_loop %[[LI:[^ ]*]] =
+  ! CHECK-SAME: %[[C1_CVT]] to %[[C5_CVT]] step %[[C1]]
+  ! CHECK-SAME: iter_args(%[[IV:.*]] = %[[LB]]) -> (index, i32) {
   do i=1,5
-  ! CHECK:   %[[LI_CVT:.*]] = fir.convert %[[LI]] : (index) -> i32
-  ! CHECK:   fir.store %[[LI_CVT]] to %[[I_REF]] : !fir.ref<i32>
+  ! CHECK:   fir.store %[[IV]] to %[[I_REF]] : !fir.ref<i32>
   ! CHECK:   %[[LI_NEXT:.*]] = arith.addi %[[LI]], %[[C1]] : index
-  ! CHECK:  fir.result %[[LI_NEXT]] : index
+  ! CHECK:   %[[STEPCAST:.*]] = fir.convert %[[C1]] : (index) -> i32
+  ! CHECK:   %[[IVINC:.*]] = arith.addi %[[IV]], %[[STEPCAST]] : i32
+  ! CHECK:  fir.result %[[LI_NEXT]], %[[IVINC]] : index, i32
   ! CHECK: }
   end do
-  ! CHECK: %[[LI_RES_CVT:.*]] = fir.convert %[[LI_RES]] : (index) -> i32
-  ! CHECK: fir.store %[[LI_RES_CVT]] to %[[I_REF]] : !fir.ref<i32>
+  ! CHECK: fir.store %[[LI_RES]]#1 to %[[I_REF]] : !fir.ref<i32>
   ! CHECK: %[[I:.*]] = fir.load %[[I_REF]] : !fir.ref<i32>
   ! CHECK: %{{.*}} = fir.call @_FortranAioOutputInteger32(%{{.*}}, %[[I]]) : (!fir.ref<i8>, i32) -> i1
   print *, i
@@ -46,19 +49,23 @@ subroutine nested_loop
   ! CHECK: %[[E_I:.*]] = arith.constant 5 : i32
   ! CHECK: %[[E_I_CVT:.*]] = fir.convert %[[E_I]] : (i32) -> index
   ! CHECK: %[[ST_I:.*]] = arith.constant 1 : index
-  ! CHECK: %[[I_RES:.*]] = fir.do_loop %[[LI:.*]] = %[[S_I_CVT]] to %[[E_I_CVT]] step %[[ST_I]] -> index {
+  ! CHECK: %[[I_LB:.*]] = fir.convert %[[S_I_CVT]] : (index) -> i32
+  ! CHECK: %[[I_RES:.*]]:2 = fir.do_loop %[[LI:[^ ]*]] =
+  ! CHECK-SAME: %[[S_I_CVT]] to %[[E_I_CVT]] step %[[ST_I]]
+  ! CHECK-SAME: iter_args(%[[I_IV:.*]] = %[[I_LB]]) -> (index, i32) {
   do i=1,5
-    ! CHECK: %[[LI_CVT:.*]] = fir.convert %[[LI]] : (index) -> i32
-    ! CHECK: fir.store %[[LI_CVT]] to %[[I_REF]] : !fir.ref<i32>
+    ! CHECK: fir.store %[[I_IV]] to %[[I_REF]] : !fir.ref<i32>
     ! CHECK: %[[S_J:.*]] = arith.constant 1 : i32
     ! CHECK: %[[S_J_CVT:.*]] = fir.convert %[[S_J]] : (i32) -> index
     ! CHECK: %[[E_J:.*]] = arith.constant 5 : i32
     ! CHECK: %[[E_J_CVT:.*]] = fir.convert %[[E_J]] : (i32) -> index
     ! CHECK: %[[ST_J:.*]] = arith.constant 1 : index
-    ! CHECK: %[[J_RES:.*]] = fir.do_loop %[[LJ:.*]] = %[[S_J_CVT]] to %[[E_J_CVT]] step %[[ST_J]] -> index {
+    ! CHECK: %[[J_LB:.*]] = fir.convert %[[S_J_CVT]] : (index) -> i32
+    ! CHECK: %[[J_RES:.*]]:2 = fir.do_loop %[[LJ:[^ ]*]] =
+    ! CHECK-SAME: %[[S_J_CVT]] to %[[E_J_CVT]] step %[[ST_J]]
+    ! CHECK-SAME: iter_args(%[[J_IV:.*]] = %[[J_LB]]) -> (index, i32) {
     do j=1,5
-      ! CHECK: %[[LJ_CVT:.*]] = fir.convert %[[LJ]] : (index) -> i32
-      ! CHECK: fir.store %[[LJ_CVT]] to %[[J_REF]] : !fir.ref<i32>
+      ! CHECK: fir.store %[[J_IV]] to %[[J_REF]] : !fir.ref<i32>
       ! CHECK: %[[ASUM:.*]] = fir.load %[[ASUM_REF]] : !fir.ref<i32>
       ! CHECK: %[[I:.*]] = fir.load %[[I_REF]] : !fir.ref<i32>
       ! CHECK: %[[I_CVT:.*]] = fir.convert %[[I]] : (i32) -> i64
@@ -74,17 +81,19 @@ subroutine nested_loop
       ! CHECK: fir.store %[[ASUM_NEW]] to %[[ASUM_REF]] : !fir.ref<i32>
       asum = asum + arr(i,j)
       ! CHECK: %[[LJ_NEXT:.*]] = arith.addi %[[LJ]], %[[ST_J]] : index
-      ! CHECK: fir.result %[[LJ_NEXT]] : index
+      ! CHECK: %[[J_STEPCAST:.*]] = fir.convert %[[ST_J]] : (index) -> i32
+      ! CHECK: %[[J_IVINC:.*]] = arith.addi %[[J_IV]], %[[J_STEPCAST]] : i32
+      ! CHECK: fir.result %[[LJ_NEXT]], %[[J_IVINC]] : index, i32
     ! CHECK: }
     end do
-    ! CHECK: %[[J_RES_CVT:.*]] = fir.convert %[[J_RES]] : (index) -> i32
-    ! CHECK: fir.store %[[J_RES_CVT]] to %[[J_REF]] : !fir.ref<i32>
+    ! CHECK: fir.store %[[J_RES]]#1 to %[[J_REF]] : !fir.ref<i32>
     ! CHECK: %[[LI_NEXT:.*]] = arith.addi %[[LI]], %[[ST_I]] : index
-    ! CHECK: fir.result %[[LI_NEXT]] : index
+    ! CHECK: %[[I_STEPCAST:.*]] = fir.convert %[[ST_I]] : (index) -> i32
+    ! CHECK: %[[I_IVINC:.*]] = arith.addi %[[I_IV]], %[[I_STEPCAST]] : i32
+    ! CHECK: fir.result %[[LI_NEXT]], %[[I_IVINC]] : index, i32
   ! CHECK: }
   end do
-  ! CHECK: %[[I_RES_CVT:.*]] = fir.convert %[[I_RES]] : (index) -> i32
-  ! CHECK: fir.store %[[I_RES_CVT]] to %[[I_REF]] : !fir.ref<i32>
+  ! CHECK: fir.store %[[I_RES]]#1 to %[[I_REF]] : !fir.ref<i32>
 end subroutine
 
 ! Test a downcounting loop
@@ -99,16 +108,19 @@ subroutine down_counting_loop()
   ! CHECK: %[[C1_CVT:.*]] = fir.convert %[[C1]] : (i32) -> index
   ! CHECK: %[[CMINUS1:.*]] = arith.constant -1 : i32
   ! CHECK: %[[CMINUS1_STEP_CVT:.*]] = fir.convert %[[CMINUS1]] : (i32) -> index
-  ! CHECK: %[[I_RES:.*]] = fir.do_loop %[[LI:.*]] = %[[C5_CVT]] to %[[C1_CVT]] step %[[CMINUS1_STEP_CVT]] -> index {
+  ! CHECK: %[[I_LB:.*]] = fir.convert %[[C5_CVT]] : (index) -> i32
+  ! CHECK: %[[I_RES:.*]]:2 = fir.do_loop %[[LI:[^ ]*]] =
+  ! CHECK-SAME: %[[C5_CVT]] to %[[C1_CVT]] step %[[CMINUS1_STEP_CVT]]
+  ! CHECK-SAME: iter_args(%[[I_IV:.*]] = %[[I_LB]]) -> (index, i32) {
   do i=5,1,-1
-  ! CHECK: %[[LI_CVT:.*]] = fir.convert %[[LI]] : (index) -> i32
-  ! CHECK: fir.store %[[LI_CVT]] to %[[I_REF]] : !fir.ref<i32>
+  ! CHECK: fir.store %[[I_IV]] to %[[I_REF]] : !fir.ref<i32>
   ! CHECK: %[[LI_NEXT:.*]] = arith.addi %[[LI]], %[[CMINUS1_STEP_CVT]] : index
-  ! CHECK: fir.result %[[LI_NEXT]] : index
+  ! CHECK: %[[I_STEPCAST:.*]] = fir.convert %[[CMINUS1_STEP_CVT]] : (index) -> i32
+  ! CHECK: %[[I_IVINC:.*]] = arith.addi %[[I_IV]], %[[I_STEPCAST]] : i32
+  ! CHECK: fir.result %[[LI_NEXT]], %[[I_IVINC]] : index, i32
   ! CHECK: }
   end do
-  ! CHECK: %[[I_RES_CVT:.*]] = fir.convert %[[I_RES]] : (index) -> i32
-  ! CHECK: fir.store %[[I_RES_CVT]] to %[[I_REF]] : !fir.ref<i32>
+  ! CHECK: fir.store %[[I_RES]]#1 to %[[I_REF]] : !fir.ref<i32>
 end subroutine
 
 ! Test a general loop with a variable step
@@ -122,16 +134,19 @@ subroutine loop_with_variable_step(s,e,st)
   ! CHECK: %[[E_CVT:.*]] = fir.convert %[[E]] : (i32) -> index
   ! CHECK: %[[ST:.*]] = fir.load %[[ST_REF]] : !fir.ref<i32>
   ! CHECK: %[[ST_CVT:.*]] = fir.convert %[[ST]] : (i32) -> index
-  ! CHECK: %[[I_RES:.*]] = fir.do_loop %[[LI:.*]] = %[[S_CVT]] to %[[E_CVT]] step %[[ST_CVT]] -> index {
+  ! CHECK: %[[I_LB:.*]] = fir.convert %[[S_CVT]] : (index) -> i32
+  ! CHECK: %[[I_RES:.*]]:2 = fir.do_loop %[[LI:[^ ]*]] =
+  ! CHECK-SAME: %[[S_CVT]] to %[[E_CVT]] step %[[ST_CVT]]
+  ! CHECK-SAME: iter_args(%[[I_IV:.*]] = %[[I_LB]]) -> (index, i32) {
   do i=s,e,st
-  ! CHECK:  %[[LI_CVT:.*]] = fir.convert %[[LI]] : (index) -> i32
-  ! CHECK:  fir.store %[[LI_CVT]] to %[[I_REF]] : !fir.ref<i32>
+  ! CHECK:  fir.store %[[I_IV]] to %[[I_REF]] : !fir.ref<i32>
   ! CHECK:  %[[LI_NEXT:.*]] = arith.addi %[[LI]], %[[ST_CVT]] : index
-  ! CHECK:  fir.result %[[LI_NEXT]] : index
+  ! CHECK: %[[I_STEPCAST:.*]] = fir.convert %[[ST_CVT]] : (index) -> i32
+  ! CHECK: %[[I_IVINC:.*]] = arith.addi %[[I_IV]], %[[I_STEPCAST]] : i32
+  ! CHECK:  fir.result %[[LI_NEXT]], %[[I_IVINC]] : index, i32
   ! CHECK: }
   end do
-  ! CHECK: %[[I_RES_CVT:.*]] = fir.convert %[[I_RES]] : (index) -> i32
-  ! CHECK: fir.store %[[I_RES_CVT]] to %[[I_REF]] : !fir.ref<i32>
+  ! CHECK: fir.store %[[I_RES]]#1 to %[[I_REF]] : !fir.ref<i32>
 end subroutine
 
 ! Test usage of pointer variables as index, start, end and step variables
@@ -170,16 +185,19 @@ subroutine loop_with_pointer_variables(s,e,st)
 ! CHECK:  %[[ST_PTR:.*]] = fir.load %[[ST_PTR_REF]] : !fir.ref<!fir.ptr<i32>>
 ! CHECK:  %[[ST:.*]] = fir.load %[[ST_PTR]] : !fir.ptr<i32>
 ! CHECK:  %[[ST_CVT:.*]] = fir.convert %[[ST]] : (i32) -> index
-! CHECK:  %[[I_RES:.*]] = fir.do_loop %[[LI:.*]] = %[[S_CVT]] to %[[E_CVT]] step %[[ST_CVT]] -> index {
+! CHECK:  %[[I_LB:.*]] = fir.convert %[[S_CVT]] : (index) -> i32
+! CHECK:  %[[I_RES:.*]]:2 = fir.do_loop %[[LI:[^ ]*]] =
+! CHECK-SAME: %[[S_CVT]] to %[[E_CVT]] step %[[ST_CVT]]
+! CHECK-SAME: iter_args(%[[I_IV:.*]] = %[[I_LB]]) -> (index, i32) {
   do iptr=sptr,eptr,stptr
-! CHECK:    %[[LI_CVT:.*]] = fir.convert %[[LI]] : (index) -> i32
-! CHECK:    fir.store %[[LI_CVT]] to %[[I_PTR]] : !fir.ptr<i32>
+! CHECK:    fir.store %[[I_IV]] to %[[I_PTR]] : !fir.ptr<i32>
 ! CHECK:    %[[LI_NEXT:.*]] = arith.addi %[[LI]], %[[ST_CVT]] : index
-! CHECK:    fir.result %[[LI_NEXT]] : index
+! CHECK:    %[[I_STEPCAST:.*]] = fir.convert %[[ST_CVT]] : (index) -> i32
+! CHECK:    %[[I_IVINC:.*]] = arith.addi %[[I_IV]], %[[I_STEPCAST]] : i32
+! CHECK:    fir.result %[[LI_NEXT]], %[[I_IVINC]] : index, i32
   end do
 ! CHECK:  }
-! CHECK:  %[[I_RES_CVT:.*]] = fir.convert %[[I_RES]] : (index) -> i32
-! CHECK:  fir.store %[[I_RES_CVT:.*]] to %[[I_PTR]] : !fir.ptr<i32>
+! CHECK:  fir.store %[[I_RES]]#1 to %[[I_PTR]] : !fir.ptr<i32>
 end subroutine
 
 ! Test usage of non-default integer kind for loop control and loop index variable
@@ -196,16 +214,19 @@ subroutine loop_with_non_default_integer(s,e,st)
   ! CHECK: %[[ST_CVT:.*]] = fir.convert %[[ST]] : (i64) -> index
   integer(kind=8) :: s, e, st
 
-  ! CHECK: %[[I_RES:.*]] = fir.do_loop %[[LI:.*]] = %[[S_CVT]] to %[[E_CVT]] step %[[ST_CVT]] -> index {
+  ! CHECK: %[[I_LB:.*]] = fir.convert %[[S_CVT]] : (index) -> i64
+  ! CHECK: %[[I_RES:.*]]:2 = fir.do_loop %[[LI:[^ ]*]] =
+  ! CHECK-SAME: %[[S_CVT]] to %[[E_CVT]] step %[[ST_CVT]]
+  ! CHECK-SAME: iter_args(%[[I_IV:.*]] = %[[I_LB]]) -> (index, i64) {
   do i=s,e,st
-    ! CHECK: %[[LI_CVT:.*]] = fir.convert %[[LI]] : (index) -> i64
-    ! CHECK: fir.store %[[LI_CVT]] to %[[I_REF]] : !fir.ref<i64>
+    ! CHECK: fir.store %[[I_IV]] to %[[I_REF]] : !fir.ref<i64>
     ! CHECK: %[[LI_NEXT:.*]] = arith.addi %[[LI]], %[[ST_CVT]] : index
-    ! CHECK: fir.result %[[LI_NEXT]] : index
+    ! CHECK: %[[I_STEPCAST:.*]] = fir.convert %[[ST_CVT]] : (index) -> i64
+    ! CHECK: %[[I_IVINC:.*]] = arith.addi %[[I_IV]], %[[I_STEPCAST]] : i64
+    ! CHECK: fir.result %[[LI_NEXT]], %[[I_IVINC]] : index, i64
   end do
   ! CHECK: }
-  ! CHECK: %[[I_RES_CVT:.*]] = fir.convert %[[I_RES]] : (index) -> i64
-  ! CHECK: fir.store %[[I_RES_CVT]] to %[[I_REF]] : !fir.ref<i64>
+  ! CHECK: fir.store %[[I_RES]]#1 to %[[I_REF]] : !fir.ref<i64>
 end subroutine
 
 ! Test real loop control.

diff  --git a/flang/test/Lower/do_loop_unstructured.f90 b/flang/test/Lower/do_loop_unstructured.f90
index 22442471954ba..a8c849a488279 100644
--- a/flang/test/Lower/do_loop_unstructured.f90
+++ b/flang/test/Lower/do_loop_unstructured.f90
@@ -205,9 +205,10 @@ subroutine nested_structured_in_unstructured()
 ! CHECK:   %[[COND:.*]] = arith.cmpi sgt, %[[TRIP_VAR]], %[[ZERO]] : i32
 ! CHECK:   cf.cond_br %[[COND]], ^[[BODY:.*]], ^[[EXIT:.*]]
 ! CHECK: ^[[BODY]]:
-! CHECK:   %{{.*}} = fir.do_loop %[[J_INDEX:.*]] = %{{.*}} to %{{.*}} step %{{.*}} -> index {
-! CHECK:     %[[J_INDEX_CVT:.*]] = fir.convert %[[J_INDEX]] : (index) -> i32
-! CHECK:     fir.store %[[J_INDEX_CVT]] to %[[LOOP_VAR_J_REF]] : !fir.ref<i32>
+! CHECK:   %{{.*}} = fir.do_loop %[[J_INDEX:[^ ]*]] =
+! CHECK-SAME: %{{.*}} to %{{.*}} step %{{[^ ]*}}
+! CHECK-SAME: iter_args(%[[J_IV:.*]] = %{{.*}}) -> (index, i32) {
+! CHECK:     fir.store %[[J_IV]] to %[[LOOP_VAR_J_REF]] : !fir.ref<i32>
 ! CHECK:   }
 ! CHECK:   %[[TRIP_VAR_I:.*]] = fir.load %[[TRIP_VAR_I_REF]] : !fir.ref<i32>
 ! CHECK:   %[[C1_3:.*]] = arith.constant 1 : i32

diff  --git a/flang/test/Lower/infinite_loop.f90 b/flang/test/Lower/infinite_loop.f90
index 8c36ac105c462..e56a57979965b 100644
--- a/flang/test/Lower/infinite_loop.f90
+++ b/flang/test/Lower/infinite_loop.f90
@@ -90,14 +90,17 @@ subroutine structured_loop_in_infinite(i)
 ! CHECK:  %[[C10:.*]] = arith.constant 10 : i32
 ! CHECK:  %[[C10_INDEX:.*]] = fir.convert %[[C10]] : (i32) -> index
 ! CHECK:  %[[C1_1:.*]] = arith.constant 1 : index
-! CHECK:  %[[J_FINAL:.*]] = fir.do_loop %[[J:.*]] = %[[C1_INDEX]] to %[[C10_INDEX]] step %[[C1_1]] -> index {
-! CHECK:    %[[J_I32:.*]] = fir.convert %[[J]] : (index) -> i32
-! CHECK:    fir.store %[[J_I32]] to %[[J_REF]] : !fir.ref<i32>
+! CHECK:  %[[J_LB:.*]] = fir.convert %[[C1_INDEX]] : (index) -> i32
+! CHECK:  %[[J_FINAL:.*]]:2 = fir.do_loop %[[J:[^ ]*]] =
+! CHECK-SAME: %[[C1_INDEX]] to %[[C10_INDEX]] step %[[C1_1]]
+! CHECK-SAME: iter_args(%[[J_IV:.*]] = %[[J_LB]]) -> (index, i32) {
+! CHECK:    fir.store %[[J_IV]] to %[[J_REF]] : !fir.ref<i32>
 ! CHECK:    %[[J_NEXT:.*]] = arith.addi %[[J]], %[[C1_1]] : index
-! CHECK:    fir.result %[[J_NEXT]] : index
+! CHECK:    %[[J_STEPCAST:.*]] = fir.convert %[[C1_1]] : (index) -> i32
+! CHECK:    %[[J_IVINC:.*]] = arith.addi %[[J_IV]], %[[J_STEPCAST]] : i32
+! CHECK:    fir.result %[[J_NEXT]], %[[J_IVINC]] : index, i32
 ! CHECK:  }
-! CHECK:  %[[J_I32:.*]] = fir.convert %[[J_FINAL]] : (index) -> i32
-! CHECK:  fir.store %[[J_I32]] to %[[J_REF]] : !fir.ref<i32>
+! CHECK:  fir.store %[[J_FINAL]]#1 to %[[J_REF]] : !fir.ref<i32>
 ! CHECK:  cf.br ^[[BODY1]]
 ! CHECK: ^[[RETURN]]:
 ! CHECK:   return

diff  --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90
index 2a95c69d6e225..febae0e8a0dd6 100644
--- a/flang/test/Lower/loops.f90
+++ b/flang/test/Lower/loops.f90
@@ -40,7 +40,7 @@ subroutine loop_test
     a(i,j,k) = a(i,j,k) + 1
   enddo
 
-  ! CHECK-COUNT-3: fir.do_loop {{[^un]*}} -> index
+  ! CHECK-COUNT-3: fir.do_loop {{[^un]*}} -> (index, i32)
   asum = 0
   do i=1,5
     do j=1,5

diff  --git a/flang/test/Lower/loops2.f90 b/flang/test/Lower/loops2.f90
index 7acd3bfc42bbc..424ee3682f1e0 100644
--- a/flang/test/Lower/loops2.f90
+++ b/flang/test/Lower/loops2.f90
@@ -15,12 +15,10 @@ subroutine test_pointer()
 ! CHECK:         %[[VAL_0:.*]] = fir.address_of(@_QMtest_loop_varEi_pointer) : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK:         %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK:         %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
-! CHECK:         fir.do_loop %[[VAL_9:.*]] =
-! CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (index) -> i32
-! CHECK:           fir.store %[[VAL_10]] to %[[VAL_2]] : !fir.ptr<i32>
+! CHECK:         %[[VAL_9:.*]]:2 = fir.do_loop{{.*}}iter_args(%[[IV:.*]] = {{.*}})
+! CHECK:           fir.store %[[IV]] to %[[VAL_2]] : !fir.ptr<i32>
 ! CHECK:         }
-! CHECK:         %[[VAL_12:.*]] = fir.convert %[[VAL_13:.*]] : (index) -> i32
-! CHECK:         fir.store %[[VAL_12]] to %[[VAL_2]] : !fir.ptr<i32>
+! CHECK:         fir.store %[[VAL_9]]#1 to %[[VAL_2]] : !fir.ptr<i32>
   end subroutine
 
 ! CHECK-LABEL: func @_QMtest_loop_varPtest_allocatable
@@ -30,12 +28,10 @@ subroutine test_allocatable()
 ! CHECK:         %[[VAL_0:.*]] = fir.address_of(@_QMtest_loop_varEi_allocatable) : !fir.ref<!fir.box<!fir.heap<i32>>>
 ! CHECK:         %[[VAL_1:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.heap<i32>>>
 ! CHECK:         %[[VAL_2:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
-! CHECK:         fir.do_loop %[[VAL_9:.*]] =
-! CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (index) -> i32
-! CHECK:           fir.store %[[VAL_10]] to %[[VAL_2]] : !fir.heap<i32>
+! CHECK:         %[[VAL_9:.*]]:2 = fir.do_loop{{.*}}iter_args(%[[IV:.*]] = {{.*}})
+! CHECK:           fir.store %[[IV]] to %[[VAL_2]] : !fir.heap<i32>
 ! CHECK:         }
-! CHECK:         %[[VAL_12:.*]] = fir.convert %[[VAL_13:.*]] : (index) -> i32
-! CHECK:         fir.store %[[VAL_12]] to %[[VAL_2]] : !fir.heap<i32>
+! CHECK:         fir.store %[[VAL_9]]#1 to %[[VAL_2]] : !fir.heap<i32>
   end subroutine
 
 ! CHECK-LABEL: func @_QMtest_loop_varPtest_real_pointer

diff  --git a/flang/test/Lower/mixed_loops.f90 b/flang/test/Lower/mixed_loops.f90
index 3135f8a86a2ce..e1a1c8b8f458f 100644
--- a/flang/test/Lower/mixed_loops.f90
+++ b/flang/test/Lower/mixed_loops.f90
@@ -90,20 +90,23 @@ subroutine do_inside_while_loop
       ! CHECK-DAG: %[[C13_I32:.*]] = arith.constant 13 : i32
       ! CHECK-DAG: %[[C13:.*]] = fir.convert %[[C13_I32]] : (i32) -> index
       ! CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-      ! CHECK: %[[RESULT:.*]] = fir.do_loop %[[IDX:.*]] = %[[C8]] to %[[C13]] step %[[C1]] -> index {
-        ! CHECK: %[[I32:.*]] = fir.convert %[[IDX]] : (index) -> i32
-        ! CHECK: fir.store %[[I32]] to %[[I_REF]] : !fir.ref<i32>
+      ! CHECK: %[[I_LB:.*]] = fir.convert %[[C8]] : (index) -> i32
+      ! CHECK: %[[RESULT:.*]]:2 = fir.do_loop %[[IDX:[^ ]*]] =
+      ! CHECK-SAME: %[[C8]] to %[[C13]] step %[[C1]]
+      ! CHECK-SAME: iter_args(%[[I_IV:.*]] = %[[I_LB]]) -> (index, i32) {
+        ! CHECK: fir.store %[[I_IV]] to %[[I_REF]] : !fir.ref<i32>
         ! CHECK-DAG: %[[J2:.*]] = fir.load %[[J_REF]] : !fir.ref<i32>
         ! CHECK-DAG: %[[C2:.*]] = arith.constant 2 : i32
         ! CHECK: %[[JINC:.*]] = arith.muli %[[C2]], %[[J2]] : i32
         ! CHECK: fir.store %[[JINC]] to %[[J_REF]] : !fir.ref<i32>
         ! CHECK: %[[IINC:.*]] = arith.addi %[[IDX]], %[[C1]] : index
-        ! CHECK: fir.result %[[IINC]] : index
+        ! CHECK: %[[I_STEPCAST:.*]] = fir.convert %[[C1]] : (index) -> i32
+        ! CHECK: %[[I_IVINC:.*]] = arith.addi %[[I_IV]], %[[I_STEPCAST]] : i32
+        ! CHECK: fir.result %[[IINC]], %[[I_IVINC]] : index, i32
       do i=8,13
         j=j*2
 
-      ! CHECK: %[[IFINAL:.*]] = fir.convert %[[RESULT]] : (index) -> i32
-      ! CHECK: fir.store %[[IFINAL]] to %[[I_REF]] : !fir.ref<i32>
+      ! CHECK: fir.store %[[RESULT]]#1 to %[[I_REF]] : !fir.ref<i32>
       end do
 
     ! CHECK: br ^[[HDR1]]