[flang-commits] [flang] a704e65 - [flang] Added alternative inlining code for hlfir.cshift. (#129176)
via flang-commits
flang-commits at lists.llvm.org
Mon Mar 3 09:58:24 PST 2025
Author: Slava Zakharin
Date: 2025-03-03T09:58:20-08:00
New Revision: a704e6587bfd974af053712c6da01fa04d74c31b
URL: https://github.com/llvm/llvm-project/commit/a704e6587bfd974af053712c6da01fa04d74c31b
DIFF: https://github.com/llvm/llvm-project/commit/a704e6587bfd974af053712c6da01fa04d74c31b.diff
LOG: [flang] Added alternative inlining code for hlfir.cshift. (#129176)
Flang generates slower code than other compilers for the
`CSHIFT(CSHIFT(PTR(:,:,I),sh1,1),sh2,2)` pattern in facerec.
The first CSHIFT can be done as two memcpy's wrapped in a loop
over the second dimension. This does require creating a temporary
array, but it seems to be faster than the current hlfir.elemental
inlining.
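As an illustration only (not code from this patch; names are made up),
the first CSHIFT amounts to the following when the leading dimension of
A is contiguous and the shift S is already normalized to
0 <= S < SIZE(A,1): two contiguous block copies per column, i.e. two
memcpys wrapped in a loop over the second dimension.

  subroutine cshift_dim1_as_copies(a, res, s)
    integer, intent(in)  :: a(:,:)    ! leading dimension assumed contiguous
    integer, intent(out) :: res(:,:)
    integer, intent(in)  :: s         ! normalized shift, 0 <= s < size(a,1)
    integer :: n, j
    n = size(a, 1)
    do j = 1, size(a, 2)
      res(1:n-s, j)   = a(s+1:n, j)   ! column tail -> result head
      res(n-s+1:n, j) = a(1:s, j)     ! column head -> result tail
    end do
  end subroutine cshift_dim1_as_copies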
I started by modifying the new index computation in the
hlfir.elemental inlining: the new arith.select approach does enable
some vectorization in LLVM, but on x86 it uses gathers/scatters
and does not give much speed-up.
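For reference, a Fortran sketch of that index computation (not from
the patch; it assumes the shift s is already normalized via MODULO so
that 0 <= s < n, with merge() standing in for arith.select):

  ! element i (1-based) of the result is element src_index(i, s, n)
  ! of ARRAY along the shifted dimension; no MODULO in the loop body
  elemental integer function src_index(i, s, n)
    integer, intent(in) :: i, s, n
    src_index = i + merge(s, s - n, i <= n - s)
  end function src_index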
I also experimented with LoopBoundSplitPass
and InductiveRangeCheckElimination for a simple (not chained) CSHIFT
case, but I could not adjust them to split a loop with a condition
on the value of the IV into two loops with disjoint iteration spaces.
I thought that if I could do it, I would be able to keep the
hlfir.elemental inlining mostly untouched and then adjust its
heuristics for the facerec case.
Since I was not able to make these passes work for me, I added a
special-case inlining for CSHIFT(ARRAY,SH,DIM=1) via hlfir.eval_in_mem.
If ARRAY is not statically known to have a contiguous leading
dimension, there is a dynamic check for contiguity, which allows
exposing it to LLVM and enables rewriting the copy loops
into memcpys. This approach steps on the toes of LoopVersioning,
but it is helpful in the facerec case.
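Roughly, the code produced on the hlfir.eval_in_mem path follows the
comment in genInMemCShift (SH is the MODULO-normalized shift and
N = SIZE(ARRAY,DIM); for DIM==1 the same loops are emitted twice under
the dynamic contiguity check):

  do i = 1, SH
    result(i + (N - SH)) = array(i)
  end do
  do i = 1, N - SH
    result(i) = array(i + SH)
  end do

With the leading dimension exposed as contiguous, LLVM can recognize
both loops as memcpy loop idioms.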
I measured a ~6% speed-up on grace and ~4% on zen4.
Added:
Modified:
flang/include/flang/Optimizer/Builder/HLFIRTools.h
flang/lib/Optimizer/Builder/HLFIRTools.cpp
flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir
Removed:
################################################################################
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index 8b1235b50cc6f..1faf451e8b495 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -517,6 +517,19 @@ Entity loadElementAt(mlir::Location loc, fir::FirOpBuilder &builder,
llvm::SmallVector<mlir::Value, Fortran::common::maxRank>
genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder, Entity entity);
+/// Generate an hlfir.designate that produces an 1D section
+/// of \p array using \p oneBasedIndices and \p dim:
+/// i = oneBasedIndices
+/// result => array(i(1), ..., i(dim-1), :, i(dim+1), ..., i(n))
+///
+/// The caller provides the pre-computed \p lbounds, \p extents
+/// and \p typeParams of the array.
+Entity gen1DSection(mlir::Location loc, fir::FirOpBuilder &builder,
+ Entity array, int64_t dim,
+ mlir::ArrayRef<mlir::Value> lbounds,
+ mlir::ArrayRef<mlir::Value> extents,
+ mlir::ValueRange oneBasedIndices,
+ mlir::ArrayRef<mlir::Value> typeParams);
} // namespace hlfir
#endif // FORTRAN_OPTIMIZER_BUILDER_HLFIRTOOLS_H
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 8993065c2bb64..f4967ed3852b9 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -1535,3 +1535,52 @@ hlfir::genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder,
shape.getDefiningOp()->erase();
return extents;
}
+
+hlfir::Entity hlfir::gen1DSection(mlir::Location loc,
+ fir::FirOpBuilder &builder,
+ hlfir::Entity array, int64_t dim,
+ mlir::ArrayRef<mlir::Value> lbounds,
+ mlir::ArrayRef<mlir::Value> extents,
+ mlir::ValueRange oneBasedIndices,
+ mlir::ArrayRef<mlir::Value> typeParams) {
+ assert(array.isVariable() && "array must be a variable");
+ assert(dim > 0 && dim <= array.getRank() && "invalid dim number");
+ mlir::Value one =
+ builder.createIntegerConstant(loc, builder.getIndexType(), 1);
+ hlfir::DesignateOp::Subscripts subscripts;
+ unsigned indexId = 0;
+ for (int i = 0; i < array.getRank(); ++i) {
+ if (i == dim - 1) {
+ mlir::Value ubound = genUBound(loc, builder, lbounds[i], extents[i], one);
+ subscripts.emplace_back(
+ hlfir::DesignateOp::Triplet{lbounds[i], ubound, one});
+ } else {
+ mlir::Value index =
+ genUBound(loc, builder, lbounds[i], oneBasedIndices[indexId++], one);
+ subscripts.emplace_back(index);
+ }
+ }
+ mlir::Value sectionShape =
+ builder.create<fir::ShapeOp>(loc, extents[dim - 1]);
+
+ // The result type is one of:
+ // !fir.box/class<!fir.array<NxT>>
+ // !fir.box/class<!fir.array<?xT>>
+ //
+ // We could use !fir.ref<!fir.array<NxT>> when the whole dimension's
+ // size is known and it is the leading dimension, but let it be simple
+ // for the time being.
+ auto seqType =
+ mlir::cast<fir::SequenceType>(array.getElementOrSequenceType());
+ int64_t dimExtent = seqType.getShape()[dim - 1];
+ mlir::Type sectionType =
+ fir::SequenceType::get({dimExtent}, seqType.getEleTy());
+ sectionType = fir::wrapInClassOrBoxType(sectionType, array.isPolymorphic());
+
+ auto designate = builder.create<hlfir::DesignateOp>(
+ loc, sectionType, array, /*component=*/"", /*componentShape=*/nullptr,
+ subscripts,
+ /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt,
+ sectionShape, typeParams);
+ return hlfir::Entity{designate.getResult()};
+}
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
index c1c3839c47e11..bac10121a881b 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
@@ -415,15 +415,13 @@ class SumAsElementalConversion : public mlir::OpRewritePattern<hlfir::SumOp> {
}
};
-class CShiftAsElementalConversion
- : public mlir::OpRewritePattern<hlfir::CShiftOp> {
+class CShiftConversion : public mlir::OpRewritePattern<hlfir::CShiftOp> {
public:
using mlir::OpRewritePattern<hlfir::CShiftOp>::OpRewritePattern;
llvm::LogicalResult
matchAndRewrite(hlfir::CShiftOp cshift,
mlir::PatternRewriter &rewriter) const override {
- using Fortran::common::maxRank;
hlfir::ExprType expr = mlir::dyn_cast<hlfir::ExprType>(cshift.getType());
assert(expr &&
@@ -445,31 +443,88 @@ class CShiftAsElementalConversion
if (dimVal <= 0 || dimVal > arrayRank)
return rewriter.notifyMatchFailure(cshift, "Invalid DIM for CSHIFT");
- mlir::Location loc = cshift.getLoc();
- fir::FirOpBuilder builder{rewriter, cshift.getOperation()};
- mlir::Type elementType = expr.getElementType();
+ // When DIM==1 and the contiguity of the input array is not statically
+ // known, try to exploit the fact that the leading dimension might be
+ // contiguous. We can do this now using hlfir.eval_in_mem with
+ // a dynamic check for the leading dimension contiguity.
+ // Otherwise, convert hlfir.cshift to hlfir.elemental.
+ //
+ // Note that the hlfir.elemental can be inlined into other hlfir.elemental,
+ // while hlfir.eval_in_mem prevents this, and we will end up creating
+ // a temporary array for the result. We may need to come up with
+ // a more sophisticated logic for picking the most efficient
+ // representation.
hlfir::Entity array = hlfir::Entity{cshift.getArray()};
- mlir::Value arrayShape = hlfir::genShape(loc, builder, array);
- llvm::SmallVector<mlir::Value> arrayExtents =
- hlfir::getExplicitExtentsFromShape(arrayShape, builder);
- llvm::SmallVector<mlir::Value, 1> typeParams;
- hlfir::genLengthParameters(loc, builder, array, typeParams);
+ mlir::Type elementType = array.getFortranElementType();
+ if (dimVal == 1 && fir::isa_trivial(elementType) &&
+ // genInMemCShift() only works for variables currently.
+ array.isVariable())
+ rewriter.replaceOp(cshift, genInMemCShift(rewriter, cshift, dimVal));
+ else
+ rewriter.replaceOp(cshift, genElementalCShift(rewriter, cshift, dimVal));
+ return mlir::success();
+ }
+
+private:
+ /// Generate MODULO(\p shiftVal, \p extent).
+ static mlir::Value normalizeShiftValue(mlir::Location loc,
+ fir::FirOpBuilder &builder,
+ mlir::Value shiftVal,
+ mlir::Value extent,
+ mlir::Type calcType) {
+ shiftVal = builder.createConvert(loc, calcType, shiftVal);
+ extent = builder.createConvert(loc, calcType, extent);
+ // Make sure that we do not divide by zero. When the dimension
+ // has zero size, turn the extent into 1. Note that the computed
+ // MODULO value won't be used in this case, so it does not matter
+ // which extent value we use.
+ mlir::Value zero = builder.createIntegerConstant(loc, calcType, 0);
+ mlir::Value one = builder.createIntegerConstant(loc, calcType, 1);
+ mlir::Value isZero = builder.create<mlir::arith::CmpIOp>(
+ loc, mlir::arith::CmpIPredicate::eq, extent, zero);
+ extent = builder.create<mlir::arith::SelectOp>(loc, isZero, one, extent);
+ shiftVal = fir::IntrinsicLibrary{builder, loc}.genModulo(
+ calcType, {shiftVal, extent});
+ return builder.createConvert(loc, calcType, shiftVal);
+ }
+
+ /// Convert \p cshift into an hlfir.elemental using
+ /// the pre-computed constant \p dimVal.
+ static mlir::Operation *genElementalCShift(mlir::PatternRewriter &rewriter,
+ hlfir::CShiftOp cshift,
+ int64_t dimVal) {
+ using Fortran::common::maxRank;
hlfir::Entity shift = hlfir::Entity{cshift.getShift()};
+ hlfir::Entity array = hlfir::Entity{cshift.getArray()};
+
+ mlir::Location loc = cshift.getLoc();
+ fir::FirOpBuilder builder{rewriter, cshift.getOperation()};
// The new index computation involves MODULO, which is not implemented
// for IndexType, so use I64 instead.
mlir::Type calcType = builder.getI64Type();
+ // All the indices arithmetic used below does not overflow
+ // signed and unsigned I64.
+ builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw |
+ mlir::arith::IntegerOverflowFlags::nuw);
- mlir::Value one = builder.createIntegerConstant(loc, calcType, 1);
+ mlir::Value arrayShape = hlfir::genShape(loc, builder, array);
+ llvm::SmallVector<mlir::Value, maxRank> arrayExtents =
+ hlfir::getExplicitExtentsFromShape(arrayShape, builder);
+ llvm::SmallVector<mlir::Value, 1> typeParams;
+ hlfir::genLengthParameters(loc, builder, array, typeParams);
+ mlir::Value shiftDimExtent =
+ builder.createConvert(loc, calcType, arrayExtents[dimVal - 1]);
mlir::Value shiftVal;
if (shift.isScalar()) {
shiftVal = hlfir::loadTrivialScalar(loc, builder, shift);
- shiftVal = builder.createConvert(loc, calcType, shiftVal);
+ shiftVal =
+ normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent, calcType);
}
auto genKernel = [&](mlir::Location loc, fir::FirOpBuilder &builder,
mlir::ValueRange inputIndices) -> hlfir::Entity {
llvm::SmallVector<mlir::Value, maxRank> indices{inputIndices};
- if (!shift.isScalar()) {
+ if (!shiftVal) {
// When the array is not a vector, section
// (s(1), s(2), ..., s(dim-1), :, s(dim+1), ..., s(n)
// of the result has a value equal to:
@@ -482,35 +537,281 @@ class CShiftAsElementalConversion
hlfir::Entity shiftElement =
hlfir::getElementAt(loc, builder, shift, shiftIndices);
shiftVal = hlfir::loadTrivialScalar(loc, builder, shiftElement);
- shiftVal = builder.createConvert(loc, calcType, shiftVal);
+ shiftVal = normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent,
+ calcType);
}
// Element i of the result (1-based) is element
- // 'MODULO(i + SH - 1, SIZE(ARRAY)) + 1' (1-based) of the original
+ // 'MODULO(i + SH - 1, SIZE(ARRAY,DIM)) + 1' (1-based) of the original
// ARRAY (or its section, when ARRAY is not a vector).
+
+ // Compute the index into the original array using the normalized
+ // shift value, which satisfies (SH >= 0 && SH < SIZE(ARRAY,DIM)):
+ // newIndex =
+ // i + ((i <= SIZE(ARRAY,DIM) - SH) ? SH : SH - SIZE(ARRAY,DIM))
+ //
+ // Such index computation allows for further loop vectorization
+ // in LLVM.
+ mlir::Value wrapBound =
+ builder.create<mlir::arith::SubIOp>(loc, shiftDimExtent, shiftVal);
+ mlir::Value adjustedShiftVal =
+ builder.create<mlir::arith::SubIOp>(loc, shiftVal, shiftDimExtent);
mlir::Value index =
builder.createConvert(loc, calcType, inputIndices[dimVal - 1]);
- mlir::Value extent = arrayExtents[dimVal - 1];
+ mlir::Value wrapCheck = builder.create<mlir::arith::CmpIOp>(
+ loc, mlir::arith::CmpIPredicate::sle, index, wrapBound);
+ mlir::Value actualShift = builder.create<mlir::arith::SelectOp>(
+ loc, wrapCheck, shiftVal, adjustedShiftVal);
mlir::Value newIndex =
- builder.create<mlir::arith::AddIOp>(loc, index, shiftVal);
- newIndex = builder.create<mlir::arith::SubIOp>(loc, newIndex, one);
- newIndex = fir::IntrinsicLibrary{builder, loc}.genModulo(
- calcType, {newIndex, builder.createConvert(loc, calcType, extent)});
- newIndex = builder.create<mlir::arith::AddIOp>(loc, newIndex, one);
+ builder.create<mlir::arith::AddIOp>(loc, index, actualShift);
newIndex = builder.createConvert(loc, builder.getIndexType(), newIndex);
-
indices[dimVal - 1] = newIndex;
hlfir::Entity element = hlfir::getElementAt(loc, builder, array, indices);
return hlfir::loadTrivialScalar(loc, builder, element);
};
+ mlir::Type elementType = array.getFortranElementType();
hlfir::ElementalOp elementalOp = hlfir::genElementalOp(
loc, builder, elementType, arrayShape, typeParams, genKernel,
/*isUnordered=*/true,
array.isPolymorphic() ? static_cast<mlir::Value>(array) : nullptr,
cshift.getResult().getType());
- rewriter.replaceOp(cshift, elementalOp);
- return mlir::success();
+ return elementalOp.getOperation();
+ }
+
+ /// Convert \p cshift into an hlfir.eval_in_mem using the pre-computed
+ /// constant \p dimVal.
+ /// The converted code looks like this:
+ /// do i=1,SH
+ /// result(i + (SIZE(ARRAY,DIM) - SH)) = array(i)
+ /// end
+ /// do i=1,SIZE(ARRAY,DIM) - SH
+ /// result(i) = array(i + SH)
+ /// end
+ ///
+ /// When \p dimVal is 1, we generate the same code twice
+ /// under a dynamic check for the contiguity of the leading
+ /// dimension. In the code corresponding to the contiguous
+ /// leading dimension, the shift dimension is represented
+ /// as a contiguous slice of the original array.
+ /// This allows recognizing the above two loops as memcpy
+ /// loop idioms in LLVM.
+ static mlir::Operation *genInMemCShift(mlir::PatternRewriter &rewriter,
+ hlfir::CShiftOp cshift,
+ int64_t dimVal) {
+ using Fortran::common::maxRank;
+ hlfir::Entity shift = hlfir::Entity{cshift.getShift()};
+ hlfir::Entity array = hlfir::Entity{cshift.getArray()};
+ assert(array.isVariable() && "array must be a variable");
+ assert(!array.isPolymorphic() &&
+ "genInMemCShift does not support polymorphic types");
+ mlir::Location loc = cshift.getLoc();
+ fir::FirOpBuilder builder{rewriter, cshift.getOperation()};
+ // The new index computation involves MODULO, which is not implemented
+ // for IndexType, so use I64 instead.
+ mlir::Type calcType = builder.getI64Type();
+ // All the indices arithmetic used below does not overflow
+ // signed and unsigned I64.
+ builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw |
+ mlir::arith::IntegerOverflowFlags::nuw);
+
+ mlir::Value arrayShape = hlfir::genShape(loc, builder, array);
+ llvm::SmallVector<mlir::Value, maxRank> arrayExtents =
+ hlfir::getExplicitExtentsFromShape(arrayShape, builder);
+ llvm::SmallVector<mlir::Value, 1> typeParams;
+ hlfir::genLengthParameters(loc, builder, array, typeParams);
+ mlir::Value shiftDimExtent =
+ builder.createConvert(loc, calcType, arrayExtents[dimVal - 1]);
+ mlir::Value shiftVal;
+ if (shift.isScalar()) {
+ shiftVal = hlfir::loadTrivialScalar(loc, builder, shift);
+ shiftVal =
+ normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent, calcType);
+ }
+
+ hlfir::EvaluateInMemoryOp evalOp =
+ builder.create<hlfir::EvaluateInMemoryOp>(
+ loc, mlir::cast<hlfir::ExprType>(cshift.getType()), arrayShape);
+ builder.setInsertionPointToStart(&evalOp.getBody().front());
+
+ mlir::Value resultArray = evalOp.getMemory();
+ mlir::Type arrayType = fir::dyn_cast_ptrEleTy(resultArray.getType());
+ resultArray = builder.createBox(loc, fir::BoxType::get(arrayType),
+ resultArray, arrayShape, /*slice=*/nullptr,
+ typeParams, /*tdesc=*/nullptr);
+
+ // This is a generator of the dimension shift code.
+ // The code is inserted inside a loop nest over the other dimensions
+ // (if any). If exposeContiguity is true, the array's section
+ // array(s(1), ..., s(dim-1), :, s(dim+1), ..., s(n)) is represented
+ // as a contiguous 1D array.
+ // shiftVal is the normalized shift value that satisfies (SH >= 0 && SH <
+ // SIZE(ARRAY,DIM)).
+ //
+ auto genDimensionShift = [&](mlir::Location loc, fir::FirOpBuilder &builder,
+ mlir::Value shiftVal, bool exposeContiguity,
+ mlir::ValueRange oneBasedIndices)
+ -> llvm::SmallVector<mlir::Value, 0> {
+ // Create a vector of indices (s(1), ..., s(dim-1), nullptr, s(dim+1),
+ // ..., s(n)) so that we can update the dimVal index as needed.
+ llvm::SmallVector<mlir::Value, maxRank> srcIndices(
+ oneBasedIndices.begin(), oneBasedIndices.begin() + (dimVal - 1));
+ srcIndices.push_back(nullptr);
+ srcIndices.append(oneBasedIndices.begin() + (dimVal - 1),
+ oneBasedIndices.end());
+ llvm::SmallVector<mlir::Value, maxRank> dstIndices(srcIndices);
+
+ hlfir::Entity srcArray = array;
+ if (exposeContiguity && mlir::isa<fir::BaseBoxType>(srcArray.getType())) {
+ assert(dimVal == 1 && "can expose contiguity only for dim 1");
+ llvm::SmallVector<mlir::Value, maxRank> arrayLbounds =
+ hlfir::genLowerbounds(loc, builder, arrayShape, array.getRank());
+ hlfir::Entity section =
+ hlfir::gen1DSection(loc, builder, srcArray, dimVal, arrayLbounds,
+ arrayExtents, oneBasedIndices, typeParams);
+ mlir::Value addr = hlfir::genVariableRawAddress(loc, builder, section);
+ mlir::Value shape = hlfir::genShape(loc, builder, section);
+ mlir::Type boxType = fir::wrapInClassOrBoxType(
+ hlfir::getFortranElementOrSequenceType(section.getType()),
+ section.isPolymorphic());
+ srcArray = hlfir::Entity{
+ builder.createBox(loc, boxType, addr, shape, /*slice=*/nullptr,
+ /*lengths=*/{}, /*tdesc=*/nullptr)};
+ // When shifting the dimension as a 1D section of the original
+ // array, we only need one index for addressing.
+ srcIndices.resize(1);
+ }
+
+ // Copy first portion of the array:
+ // do i=1,SH
+ // result(i + (SIZE(ARRAY,DIM) - SH)) = array(i)
+ // end
+ auto genAssign1 = [&](mlir::Location loc, fir::FirOpBuilder &builder,
+ mlir::ValueRange index,
+ mlir::ValueRange reductionArgs)
+ -> llvm::SmallVector<mlir::Value, 0> {
+ assert(index.size() == 1 && "expected single loop");
+ mlir::Value srcIndex = builder.createConvert(loc, calcType, index[0]);
+ srcIndices[dimVal - 1] = srcIndex;
+ hlfir::Entity srcElementValue =
+ hlfir::loadElementAt(loc, builder, srcArray, srcIndices);
+ mlir::Value dstIndex = builder.create<mlir::arith::AddIOp>(
+ loc, srcIndex,
+ builder.create<mlir::arith::SubIOp>(loc, shiftDimExtent, shiftVal));
+ dstIndices[dimVal - 1] = dstIndex;
+ hlfir::Entity dstElement = hlfir::getElementAt(
+ loc, builder, hlfir::Entity{resultArray}, dstIndices);
+ builder.create<hlfir::AssignOp>(loc, srcElementValue, dstElement);
+ return {};
+ };
+
+ // Generate the first loop.
+ hlfir::genLoopNestWithReductions(loc, builder, {shiftVal},
+ /*reductionInits=*/{}, genAssign1,
+ /*isUnordered=*/true);
+
+ // Copy second portion of the array:
+ // do i=1,SIZE(ARRAY,DIM)-SH
+ // result(i) = array(i + SH)
+ // end
+ auto genAssign2 = [&](mlir::Location loc, fir::FirOpBuilder &builder,
+ mlir::ValueRange index,
+ mlir::ValueRange reductionArgs)
+ -> llvm::SmallVector<mlir::Value, 0> {
+ assert(index.size() == 1 && "expected single loop");
+ mlir::Value dstIndex = builder.createConvert(loc, calcType, index[0]);
+ mlir::Value srcIndex =
+ builder.create<mlir::arith::AddIOp>(loc, dstIndex, shiftVal);
+ srcIndices[dimVal - 1] = srcIndex;
+ hlfir::Entity srcElementValue =
+ hlfir::loadElementAt(loc, builder, srcArray, srcIndices);
+ dstIndices[dimVal - 1] = dstIndex;
+ hlfir::Entity dstElement = hlfir::getElementAt(
+ loc, builder, hlfir::Entity{resultArray}, dstIndices);
+ builder.create<hlfir::AssignOp>(loc, srcElementValue, dstElement);
+ return {};
+ };
+
+ // Generate the second loop.
+ mlir::Value bound =
+ builder.create<mlir::arith::SubIOp>(loc, shiftDimExtent, shiftVal);
+ hlfir::genLoopNestWithReductions(loc, builder, {bound},
+ /*reductionInits=*/{}, genAssign2,
+ /*isUnordered=*/true);
+ return {};
+ };
+
+ // A wrapper around genDimensionShift that computes the normalized
+ // shift value and manages the insertion of the multiple versions
+ // of the shift based on the dynamic check of the leading dimension's
+ // contiguity (when dimVal == 1).
+ auto genShiftBody = [&](mlir::Location loc, fir::FirOpBuilder &builder,
+ mlir::ValueRange oneBasedIndices,
+ mlir::ValueRange reductionArgs)
+ -> llvm::SmallVector<mlir::Value, 0> {
+ // Copy the dimension with a shift:
+ // SH is either SHIFT (if scalar) or SHIFT(oneBasedIndices).
+ if (!shiftVal) {
+ assert(!oneBasedIndices.empty() && "scalar shift must be precomputed");
+ hlfir::Entity shiftElement =
+ hlfir::getElementAt(loc, builder, shift, oneBasedIndices);
+ shiftVal = hlfir::loadTrivialScalar(loc, builder, shiftElement);
+ shiftVal = normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent,
+ calcType);
+ }
+
+ // If we can fetch the byte stride of the leading dimension,
+ // and the byte size of the element, then we can generate
+ // a dynamic contiguity check and expose the leading dimension's
+ // contiguity in FIR, making memcpy loop idiom recognition
+ // possible.
+ mlir::Value elemSize;
+ mlir::Value stride;
+ if (dimVal == 1 && mlir::isa<fir::BaseBoxType>(array.getType())) {
+ mlir::Type indexType = builder.getIndexType();
+ elemSize =
+ builder.create<fir::BoxEleSizeOp>(loc, indexType, array.getBase());
+ mlir::Value dimIdx =
+ builder.createIntegerConstant(loc, indexType, dimVal - 1);
+ auto boxDim = builder.create<fir::BoxDimsOp>(
+ loc, indexType, indexType, indexType, array.getBase(), dimIdx);
+ stride = boxDim.getByteStride();
+ }
+
+ if (array.isSimplyContiguous() || !elemSize || !stride) {
+ genDimensionShift(loc, builder, shiftVal, /*exposeContiguity=*/false,
+ oneBasedIndices);
+ return {};
+ }
+
+ mlir::Value isContiguous = builder.create<mlir::arith::CmpIOp>(
+ loc, mlir::arith::CmpIPredicate::eq, elemSize, stride);
+ builder.genIfOp(loc, {}, isContiguous, /*withElseRegion=*/true)
+ .genThen([&]() {
+ genDimensionShift(loc, builder, shiftVal, /*exposeContiguity=*/true,
+ oneBasedIndices);
+ })
+ .genElse([&]() {
+ genDimensionShift(loc, builder, shiftVal,
+ /*exposeContiguity=*/false, oneBasedIndices);
+ });
+
+ return {};
+ };
+
+ // For 1D case, generate a single loop.
+ // For ND case, generate a loop nest over the other dimensions
+ // with a single loop inside (generated separately).
+ llvm::SmallVector<mlir::Value, maxRank> newExtents(arrayExtents);
+ newExtents.erase(newExtents.begin() + (dimVal - 1));
+ if (!newExtents.empty())
+ hlfir::genLoopNestWithReductions(loc, builder, newExtents,
+ /*reductionInits=*/{}, genShiftBody,
+ /*isUnordered=*/true);
+ else
+ genShiftBody(loc, builder, {}, {});
+
+ return evalOp.getOperation();
}
};
@@ -1181,7 +1482,7 @@ class SimplifyHLFIRIntrinsics
mlir::RewritePatternSet patterns(context);
patterns.insert<TransposeAsElementalConversion>(context);
patterns.insert<SumAsElementalConversion>(context);
- patterns.insert<CShiftAsElementalConversion>(context);
+ patterns.insert<CShiftConversion>(context);
patterns.insert<MatmulConversion<hlfir::MatmulTransposeOp>>(context);
// If forceMatmulAsElemental is false, then hlfir.matmul inlining
diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir
index d21d7755062ba..35530c66f4038 100644
--- a/flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir
+++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir
@@ -1,4 +1,4 @@
-// Test hlfir.cshift simplification to hlfir.elemental:
+// Test hlfir.cshift simplification to hlfir.elemental and hlfir.eval_in_mem:
// RUN: fir-opt --simplify-hlfir-intrinsics %s | FileCheck %s
func.func @cshift_vector(%arg0: !fir.box<!fir.array<?xi32>>, %arg1: !fir.ref<i32>) -> !hlfir.expr<?xi32>{
@@ -6,39 +6,114 @@ func.func @cshift_vector(%arg0: !fir.box<!fir.array<?xi32>>, %arg1: !fir.ref<i32
return %res : !hlfir.expr<?xi32>
}
// CHECK-LABEL: func.func @cshift_vector(
-// CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>,
-// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32>) -> !hlfir.expr<?xi32> {
-// CHECK: %[[VAL_26:.*]] = arith.constant 1 : index
-// CHECK: %[[VAL_16:.*]] = arith.constant 0 : i64
-// CHECK: %[[VAL_5:.*]] = arith.constant 1 : i64
-// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-// CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]]#1 : (index) -> !fir.shape<1>
-// CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i32) -> i64
-// CHECK: %[[VAL_8:.*]] = hlfir.elemental %[[VAL_4]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
-// CHECK: ^bb0(%[[VAL_9:.*]]: index):
-// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (index) -> i64
-// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_10]], %[[VAL_7]] : i64
-// CHECK: %[[VAL_12:.*]] = arith.subi %[[VAL_11]], %[[VAL_5]] : i64
-// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_3]]#1 : (index) -> i64
-// CHECK: %[[VAL_14:.*]] = arith.remsi %[[VAL_12]], %[[VAL_13]] : i64
-// CHECK: %[[VAL_15:.*]] = arith.xori %[[VAL_12]], %[[VAL_13]] : i64
-// CHECK: %[[VAL_17:.*]] = arith.cmpi slt, %[[VAL_15]], %[[VAL_16]] : i64
-// CHECK: %[[VAL_18:.*]] = arith.cmpi ne, %[[VAL_14]], %[[VAL_16]] : i64
-// CHECK: %[[VAL_19:.*]] = arith.andi %[[VAL_18]], %[[VAL_17]] : i1
-// CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_14]], %[[VAL_13]] : i64
-// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_19]], %[[VAL_20]], %[[VAL_14]] : i64
-// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_5]] : i64
-// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i64) -> index
-// CHECK: %[[VAL_25:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-// CHECK: %[[VAL_27:.*]] = arith.subi %[[VAL_25]]#0, %[[VAL_26]] : index
-// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_23]], %[[VAL_27]] : index
-// CHECK: %[[VAL_29:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_28]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-// CHECK: %[[VAL_30:.*]] = fir.load %[[VAL_29]] : !fir.ref<i32>
-// CHECK: hlfir.yield_element %[[VAL_30]] : i32
+// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?xi32>>,
+// CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<i32>) -> !hlfir.expr<?xi32> {
+// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_3:.*]] = arith.constant 1 : i64
+// CHECK: %[[VAL_4:.*]] = arith.constant 0 : i64
+// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]]#1 : (index) -> !fir.shape<1>
+// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64
+// CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64
+// CHECK: %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_4]] : i64
+// CHECK: %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[VAL_3]], %[[VAL_8]] : i64
+// CHECK: %[[VAL_11:.*]] = arith.remsi %[[VAL_10]], %[[EXTENT]] : i64
+// CHECK: %[[VAL_12:.*]] = arith.xori %[[VAL_10]], %[[EXTENT]] : i64
+// CHECK: %[[VAL_13:.*]] = arith.cmpi slt, %[[VAL_12]], %[[VAL_4]] : i64
+// CHECK: %[[VAL_14:.*]] = arith.cmpi ne, %[[VAL_11]], %[[VAL_4]] : i64
+// CHECK: %[[VAL_15:.*]] = arith.andi %[[VAL_14]], %[[VAL_13]] : i1
+// CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_11]], %[[EXTENT]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_17:.*]] = arith.select %[[VAL_15]], %[[VAL_16]], %[[VAL_11]] : i64
+// CHECK: %[[VAL_18:.*]] = hlfir.eval_in_mem shape %[[VAL_7]] : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
+// CHECK: ^bb0(%[[VAL_19:.*]]: !fir.ref<!fir.array<?xi32>>):
+// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_7]]) : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box<!fir.array<?xi32>>) -> index
+// CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+
+// CHECK: %[[VAL_22:.*]] = arith.cmpi eq, %[[VAL_3]], %[[VAL_21]]#2 : index
+// CHECK: fir.if %[[VAL_22]] {
+// CHECK: %[[VAL_23:.*]] = fir.shape %[[VAL_6]]#1 : (index) -> !fir.shape<1>
+// CHECK: %[[VAL_24:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]:%[[VAL_6]]#1:%[[VAL_2]]) shape %[[VAL_23]] : (!fir.box<!fir.array<?xi32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK: %[[VAL_25:.*]] = fir.box_addr %[[VAL_24]] : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_23]]) : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_17]] : (i64) -> index
+// CHECK: fir.do_loop %[[VAL_28:.*]] = %[[VAL_2]] to %[[VAL_27]] step %[[VAL_2]] unordered {
+// CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (index) -> i64
+// CHECK: %[[VAL_30:.*]]:3 = fir.box_dims %[[VAL_26]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_29]] : (i64) -> index
+// CHECK: %[[VAL_32:.*]] = arith.subi %[[VAL_30]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_33:.*]] = arith.addi %[[VAL_31]], %[[VAL_32]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_34:.*]] = hlfir.designate %[[VAL_26]] (%[[VAL_33]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK: %[[VAL_35:.*]] = fir.load %[[VAL_34]] : !fir.ref<i32>
+// CHECK: %[[VAL_36:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_37:.*]] = arith.addi %[[VAL_29]], %[[VAL_36]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_38:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_39:.*]] = fir.convert %[[VAL_37]] : (i64) -> index
+// CHECK: %[[VAL_40:.*]] = arith.subi %[[VAL_38]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_41:.*]] = arith.addi %[[VAL_39]], %[[VAL_40]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_42:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_41]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK: hlfir.assign %[[VAL_35]] to %[[VAL_42]] : i32, !fir.ref<i32>
+// CHECK: }
+// CHECK: %[[VAL_43:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_44:.*]] = fir.convert %[[VAL_43]] : (i64) -> index
+// CHECK: fir.do_loop %[[VAL_45:.*]] = %[[VAL_2]] to %[[VAL_44]] step %[[VAL_2]] unordered {
+// CHECK: %[[VAL_46:.*]] = fir.convert %[[VAL_45]] : (index) -> i64
+// CHECK: %[[VAL_47:.*]] = arith.addi %[[VAL_46]], %[[VAL_17]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_48:.*]]:3 = fir.box_dims %[[VAL_26]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (i64) -> index
+// CHECK: %[[VAL_50:.*]] = arith.subi %[[VAL_48]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_51:.*]] = arith.addi %[[VAL_49]], %[[VAL_50]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_52:.*]] = hlfir.designate %[[VAL_26]] (%[[VAL_51]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK: %[[VAL_53:.*]] = fir.load %[[VAL_52]] : !fir.ref<i32>
+// CHECK: %[[VAL_54:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_55:.*]] = fir.convert %[[VAL_46]] : (i64) -> index
+// CHECK: %[[VAL_56:.*]] = arith.subi %[[VAL_54]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_57:.*]] = arith.addi %[[VAL_55]], %[[VAL_56]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_58:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_57]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK: hlfir.assign %[[VAL_53]] to %[[VAL_58]] : i32, !fir.ref<i32>
+// CHECK: }
+// CHECK: } else {
+// CHECK: %[[VAL_59:.*]] = fir.convert %[[VAL_17]] : (i64) -> index
+// CHECK: fir.do_loop %[[VAL_60:.*]] = %[[VAL_2]] to %[[VAL_59]] step %[[VAL_2]] unordered {
+// CHECK: %[[VAL_61:.*]] = fir.convert %[[VAL_60]] : (index) -> i64
+// CHECK: %[[VAL_62:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_63:.*]] = fir.convert %[[VAL_61]] : (i64) -> index
+// CHECK: %[[VAL_64:.*]] = arith.subi %[[VAL_62]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_65:.*]] = arith.addi %[[VAL_63]], %[[VAL_64]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_66:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_65]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK: %[[VAL_67:.*]] = fir.load %[[VAL_66]] : !fir.ref<i32>
+// CHECK: %[[VAL_68:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_69:.*]] = arith.addi %[[VAL_61]], %[[VAL_68]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_70:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_71:.*]] = fir.convert %[[VAL_69]] : (i64) -> index
+// CHECK: %[[VAL_72:.*]] = arith.subi %[[VAL_70]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_73:.*]] = arith.addi %[[VAL_71]], %[[VAL_72]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_74:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_73]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK: hlfir.assign %[[VAL_67]] to %[[VAL_74]] : i32, !fir.ref<i32>
+// CHECK: }
+// CHECK: %[[VAL_75:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_76:.*]] = fir.convert %[[VAL_75]] : (i64) -> index
+// CHECK: fir.do_loop %[[VAL_77:.*]] = %[[VAL_2]] to %[[VAL_76]] step %[[VAL_2]] unordered {
+// CHECK: %[[VAL_78:.*]] = fir.convert %[[VAL_77]] : (index) -> i64
+// CHECK: %[[VAL_79:.*]] = arith.addi %[[VAL_78]], %[[VAL_17]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_80:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_81:.*]] = fir.convert %[[VAL_79]] : (i64) -> index
+// CHECK: %[[VAL_82:.*]] = arith.subi %[[VAL_80]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_83:.*]] = arith.addi %[[VAL_81]], %[[VAL_82]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_84:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_83]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK: %[[VAL_85:.*]] = fir.load %[[VAL_84]] : !fir.ref<i32>
+// CHECK: %[[VAL_86:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_87:.*]] = fir.convert %[[VAL_78]] : (i64) -> index
+// CHECK: %[[VAL_88:.*]] = arith.subi %[[VAL_86]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_89:.*]] = arith.addi %[[VAL_87]], %[[VAL_88]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_90:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_89]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK: hlfir.assign %[[VAL_85]] to %[[VAL_90]] : i32, !fir.ref<i32>
+// CHECK: }
+// CHECK: }
// CHECK: }
-// CHECK: return
+// CHECK: return %[[VAL_18]] : !hlfir.expr<?xi32>
// CHECK: }
func.func @cshift_2d_by_scalar(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.ref<i32>) -> !hlfir.expr<?x?xi32> {
@@ -47,43 +122,47 @@ func.func @cshift_2d_by_scalar(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir
return %res : !hlfir.expr<?x?xi32>
}
// CHECK-LABEL: func.func @cshift_2d_by_scalar(
-// CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xi32>>,
-// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32>) -> !hlfir.expr<?x?xi32> {
-// CHECK: %[[VAL_20:.*]] = arith.constant 0 : i64
-// CHECK: %[[VAL_8:.*]] = arith.constant 1 : i64
-// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index
-// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_4]]#1, %[[VAL_6]]#1 : (index, index) -> !fir.shape<2>
+// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?x?xi32>>,
+// CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<i32>) -> !hlfir.expr<?x?xi32> {
+// CHECK: %[[ONE:.*]] = arith.constant 1 : i64
+// CHECK: %[[VAL_2:.*]] = arith.constant 0 : i64
+// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_5]]#1, %[[VAL_6]]#1 : (index, index) -> !fir.shape<2>
+// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64
// CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64
-// CHECK: %[[VAL_11:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<2>) -> !hlfir.expr<?x?xi32> {
-// CHECK: ^bb0(%[[VAL_12:.*]]: index, %[[VAL_13:.*]]: index):
-// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (index) -> i64
-// CHECK: %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[VAL_10]] : i64
-// CHECK: %[[VAL_16:.*]] = arith.subi %[[VAL_15]], %[[VAL_8]] : i64
-// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64
-// CHECK: %[[VAL_18:.*]] = arith.remsi %[[VAL_16]], %[[VAL_17]] : i64
-// CHECK: %[[VAL_19:.*]] = arith.xori %[[VAL_16]], %[[VAL_17]] : i64
-// CHECK: %[[VAL_21:.*]] = arith.cmpi slt, %[[VAL_19]], %[[VAL_20]] : i64
-// CHECK: %[[VAL_22:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_20]] : i64
-// CHECK: %[[VAL_23:.*]] = arith.andi %[[VAL_22]], %[[VAL_21]] : i1
-// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_18]], %[[VAL_17]] : i64
-// CHECK: %[[VAL_25:.*]] = arith.select %[[VAL_23]], %[[VAL_24]], %[[VAL_18]] : i64
-// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_25]], %[[VAL_8]] : i64
+// CHECK: %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_2]] : i64
+// CHECK: %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[ONE]], %[[VAL_8]] : i64
+// CHECK: %[[VAL_11:.*]] = arith.remsi %[[VAL_10]], %[[EXTENT]] : i64
+// CHECK: %[[VAL_12:.*]] = arith.xori %[[VAL_10]], %[[EXTENT]] : i64
+// CHECK: %[[VAL_13:.*]] = arith.cmpi slt, %[[VAL_12]], %[[VAL_2]] : i64
+// CHECK: %[[VAL_14:.*]] = arith.cmpi ne, %[[VAL_11]], %[[VAL_2]] : i64
+// CHECK: %[[VAL_15:.*]] = arith.andi %[[VAL_14]], %[[VAL_13]] : i1
+// CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_11]], %[[EXTENT]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_17:.*]] = arith.select %[[VAL_15]], %[[VAL_16]], %[[VAL_11]] : i64
+// CHECK: %[[VAL_18:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<2>) -> !hlfir.expr<?x?xi32> {
+// CHECK: ^bb0(%[[VAL_19:.*]]: index, %[[VAL_20:.*]]: index):
+// CHECK: %[[VAL_21:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_22:.*]] = arith.subi %[[VAL_17]], %[[VAL_8]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_20]] : (index) -> i64
+// CHECK: %[[VAL_24:.*]] = arith.cmpi sle, %[[VAL_23]], %[[VAL_21]] : i64
+// CHECK: %[[VAL_25:.*]] = arith.select %[[VAL_24]], %[[VAL_17]], %[[VAL_22]] : i64
+// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_23]], %[[VAL_25]] overflow<nsw, nuw> : i64
// CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (i64) -> index
+// CHECK: %[[VAL_28:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
// CHECK: %[[VAL_29:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK: %[[VAL_31:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK: %[[VAL_33:.*]] = arith.subi %[[VAL_29]]#0, %[[VAL_5]] : index
-// CHECK: %[[VAL_34:.*]] = arith.addi %[[VAL_12]], %[[VAL_33]] : index
-// CHECK: %[[VAL_35:.*]] = arith.subi %[[VAL_31]]#0, %[[VAL_5]] : index
-// CHECK: %[[VAL_36:.*]] = arith.addi %[[VAL_27]], %[[VAL_35]] : index
-// CHECK: %[[VAL_37:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_34]], %[[VAL_36]]) : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
-// CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_37]] : !fir.ref<i32>
-// CHECK: hlfir.yield_element %[[VAL_38]] : i32
+// CHECK: %[[VAL_30:.*]] = arith.subi %[[VAL_28]]#0, %[[VAL_3]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_31:.*]] = arith.addi %[[VAL_19]], %[[VAL_30]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_32:.*]] = arith.subi %[[VAL_29]]#0, %[[VAL_3]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_33:.*]] = arith.addi %[[VAL_27]], %[[VAL_32]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_34:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_31]], %[[VAL_33]]) : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
+// CHECK: %[[VAL_35:.*]] = fir.load %[[VAL_34]] : !fir.ref<i32>
+// CHECK: hlfir.yield_element %[[VAL_35]] : i32
// CHECK: }
-// CHECK: return
+// CHECK: return %[[VAL_18]] : !hlfir.expr<?x?xi32>
// CHECK: }
func.func @cshift_2d_by_vector(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.box<!fir.array<?xi32>>) -> !hlfir.expr<?x?xi32> {
@@ -92,47 +171,51 @@ func.func @cshift_2d_by_vector(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir
return %res : !hlfir.expr<?x?xi32>
}
// CHECK-LABEL: func.func @cshift_2d_by_vector(
-// CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xi32>>,
-// CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>) -> !hlfir.expr<?x?xi32> {
-// CHECK: %[[VAL_26:.*]] = arith.constant 0 : i64
-// CHECK: %[[VAL_8:.*]] = arith.constant 1 : i64
-// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index
-// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_4]]#1, %[[VAL_6]]#1 : (index, index) -> !fir.shape<2>
+// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?x?xi32>>,
+// CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?xi32>>) -> !hlfir.expr<?x?xi32> {
+// CHECK: %[[ONE:.*]] = arith.constant 1 : i64
+// CHECK: %[[VAL_2:.*]] = arith.constant 0 : i64
+// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_5]]#1, %[[VAL_6]]#1 : (index, index) -> !fir.shape<2>
+// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64
// CHECK: %[[VAL_9:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<2>) -> !hlfir.expr<?x?xi32> {
// CHECK: ^bb0(%[[VAL_10:.*]]: index, %[[VAL_11:.*]]: index):
-// CHECK: %[[VAL_13:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_3]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-// CHECK: %[[VAL_15:.*]] = arith.subi %[[VAL_13]]#0, %[[VAL_5]] : index
-// CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_10]], %[[VAL_15]] : index
-// CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_16]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-// CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref<i32>
-// CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> i64
-// CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_11]] : (index) -> i64
-// CHECK: %[[VAL_21:.*]] = arith.addi %[[VAL_20]], %[[VAL_19]] : i64
-// CHECK: %[[VAL_22:.*]] = arith.subi %[[VAL_21]], %[[VAL_8]] : i64
-// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64
-// CHECK: %[[VAL_24:.*]] = arith.remsi %[[VAL_22]], %[[VAL_23]] : i64
-// CHECK: %[[VAL_25:.*]] = arith.xori %[[VAL_22]], %[[VAL_23]] : i64
-// CHECK: %[[VAL_27:.*]] = arith.cmpi slt, %[[VAL_25]], %[[VAL_26]] : i64
-// CHECK: %[[VAL_28:.*]] = arith.cmpi ne, %[[VAL_24]], %[[VAL_26]] : i64
-// CHECK: %[[VAL_29:.*]] = arith.andi %[[VAL_28]], %[[VAL_27]] : i1
-// CHECK: %[[VAL_30:.*]] = arith.addi %[[VAL_24]], %[[VAL_23]] : i64
-// CHECK: %[[VAL_31:.*]] = arith.select %[[VAL_29]], %[[VAL_30]], %[[VAL_24]] : i64
-// CHECK: %[[VAL_32:.*]] = arith.addi %[[VAL_31]], %[[VAL_8]] : i64
-// CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i64) -> index
-// CHECK: %[[VAL_35:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK: %[[VAL_37:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK: %[[VAL_39:.*]] = arith.subi %[[VAL_35]]#0, %[[VAL_5]] : index
-// CHECK: %[[VAL_40:.*]] = arith.addi %[[VAL_10]], %[[VAL_39]] : index
-// CHECK: %[[VAL_41:.*]] = arith.subi %[[VAL_37]]#0, %[[VAL_5]] : index
-// CHECK: %[[VAL_42:.*]] = arith.addi %[[VAL_33]], %[[VAL_41]] : index
-// CHECK: %[[VAL_43:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_40]], %[[VAL_42]]) : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
-// CHECK: %[[VAL_44:.*]] = fir.load %[[VAL_43]] : !fir.ref<i32>
-// CHECK: hlfir.yield_element %[[VAL_44]] : i32
+// CHECK: %[[VAL_12:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_4]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_13:.*]] = arith.subi %[[VAL_12]]#0, %[[VAL_3]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_10]], %[[VAL_13]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_15:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_14]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<i32>
+// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i32) -> i64
+// CHECK: %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_2]] : i64
+// CHECK: %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[ONE]], %[[VAL_8]] : i64
+// CHECK: %[[VAL_18:.*]] = arith.remsi %[[VAL_17]], %[[EXTENT]] : i64
+// CHECK: %[[VAL_19:.*]] = arith.xori %[[VAL_17]], %[[EXTENT]] : i64
+// CHECK: %[[VAL_20:.*]] = arith.cmpi slt, %[[VAL_19]], %[[VAL_2]] : i64
+// CHECK: %[[VAL_21:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_2]] : i64
+// CHECK: %[[VAL_22:.*]] = arith.andi %[[VAL_21]], %[[VAL_20]] : i1
+// CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_18]], %[[EXTENT]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_24:.*]] = arith.select %[[VAL_22]], %[[VAL_23]], %[[VAL_18]] : i64
+// CHECK: %[[VAL_25:.*]] = arith.subi %[[VAL_8]], %[[VAL_24]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_26:.*]] = arith.subi %[[VAL_24]], %[[VAL_8]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_11]] : (index) -> i64
+// CHECK: %[[VAL_28:.*]] = arith.cmpi sle, %[[VAL_27]], %[[VAL_25]] : i64
+// CHECK: %[[VAL_29:.*]] = arith.select %[[VAL_28]], %[[VAL_24]], %[[VAL_26]] : i64
+// CHECK: %[[VAL_30:.*]] = arith.addi %[[VAL_27]], %[[VAL_29]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (i64) -> index
+// CHECK: %[[VAL_32:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_33:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
+// CHECK: %[[VAL_34:.*]] = arith.subi %[[VAL_32]]#0, %[[VAL_3]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_35:.*]] = arith.addi %[[VAL_10]], %[[VAL_34]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_36:.*]] = arith.subi %[[VAL_33]]#0, %[[VAL_3]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_37:.*]] = arith.addi %[[VAL_31]], %[[VAL_36]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_38:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_35]], %[[VAL_37]]) : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
+// CHECK: %[[VAL_39:.*]] = fir.load %[[VAL_38]] : !fir.ref<i32>
+// CHECK: hlfir.yield_element %[[VAL_39]] : i32
// CHECK: }
-// CHECK: return
+// CHECK: return %[[VAL_9]] : !hlfir.expr<?x?xi32>
// CHECK: }
func.func @cshift_vector_char(%arg0: !fir.box<!fir.array<?x!fir.char<2,?>>>, %arg1: !fir.ref<i32>) -> !hlfir.expr<?x!fir.char<2,?>> {
@@ -140,43 +223,47 @@ func.func @cshift_vector_char(%arg0: !fir.box<!fir.array<?x!fir.char<2,?>>>, %ar
return %res : !hlfir.expr<?x!fir.char<2,?>>
}
// CHECK-LABEL: func.func @cshift_vector_char(
-// CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.char<2,?>>>,
-// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32>) -> !hlfir.expr<?x!fir.char<2,?>> {
-// CHECK: %[[VAL_32:.*]] = arith.constant 1 : index
-// CHECK: %[[VAL_19:.*]] = arith.constant 0 : i64
-// CHECK: %[[VAL_8:.*]] = arith.constant 1 : i64
-// CHECK: %[[VAL_6:.*]] = arith.constant 2 : index
-// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>, index) -> (index, index, index)
-// CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]]#1 : (index) -> !fir.shape<1>
-// CHECK: %[[VAL_5:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>) -> index
-// CHECK: %[[VAL_7:.*]] = arith.divsi %[[VAL_5]], %[[VAL_6]] : index
-// CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64
-// CHECK: %[[VAL_11:.*]] = hlfir.elemental %[[VAL_4]] typeparams %[[VAL_7]] unordered : (!fir.shape<1>, index) -> !hlfir.expr<?x!fir.char<2,?>> {
-// CHECK: ^bb0(%[[VAL_12:.*]]: index):
-// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (index) -> i64
-// CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_10]] : i64
-// CHECK: %[[VAL_15:.*]] = arith.subi %[[VAL_14]], %[[VAL_8]] : i64
-// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_3]]#1 : (index) -> i64
-// CHECK: %[[VAL_17:.*]] = arith.remsi %[[VAL_15]], %[[VAL_16]] : i64
-// CHECK: %[[VAL_18:.*]] = arith.xori %[[VAL_15]], %[[VAL_16]] : i64
-// CHECK: %[[VAL_20:.*]] = arith.cmpi slt, %[[VAL_18]], %[[VAL_19]] : i64
-// CHECK: %[[VAL_21:.*]] = arith.cmpi ne, %[[VAL_17]], %[[VAL_19]] : i64
-// CHECK: %[[VAL_22:.*]] = arith.andi %[[VAL_21]], %[[VAL_20]] : i1
-// CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_17]], %[[VAL_16]] : i64
-// CHECK: %[[VAL_24:.*]] = arith.select %[[VAL_22]], %[[VAL_23]], %[[VAL_17]] : i64
-// CHECK: %[[VAL_25:.*]] = arith.addi %[[VAL_24]], %[[VAL_8]] : i64
-// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i64) -> index
-// CHECK: %[[VAL_27:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>) -> index
-// CHECK: %[[VAL_29:.*]] = arith.divsi %[[VAL_27]], %[[VAL_6]] : index
-// CHECK: %[[VAL_31:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>, index) -> (index, index, index)
-// CHECK: %[[VAL_33:.*]] = arith.subi %[[VAL_31]]#0, %[[VAL_32]] : index
-// CHECK: %[[VAL_34:.*]] = arith.addi %[[VAL_26]], %[[VAL_33]] : index
-// CHECK: %[[VAL_35:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_34]]) typeparams %[[VAL_29]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>, index, index) -> !fir.boxchar<2>
-// CHECK: hlfir.yield_element %[[VAL_35]] : !fir.boxchar<2>
+// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?x!fir.char<2,?>>>,
+// CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<i32>) -> !hlfir.expr<?x!fir.char<2,?>> {
+// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK: %[[ONE:.*]] = arith.constant 1 : i64
+// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64
+// CHECK: %[[VAL_4:.*]] = arith.constant 2 : index
+// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>, index) -> (index, index, index)
+// CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]]#1 : (index) -> !fir.shape<1>
+// CHECK: %[[VAL_8:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>) -> index
+// CHECK: %[[VAL_9:.*]] = arith.divsi %[[VAL_8]], %[[VAL_4]] : index
+// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64
+// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64
+// CHECK: %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_10]], %[[VAL_3]] : i64
+// CHECK: %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[ONE]], %[[VAL_10]] : i64
+// CHECK: %[[VAL_13:.*]] = arith.remsi %[[VAL_12]], %[[EXTENT]] : i64
+// CHECK: %[[VAL_14:.*]] = arith.xori %[[VAL_12]], %[[EXTENT]] : i64
+// CHECK: %[[VAL_15:.*]] = arith.cmpi slt, %[[VAL_14]], %[[VAL_3]] : i64
+// CHECK: %[[VAL_16:.*]] = arith.cmpi ne, %[[VAL_13]], %[[VAL_3]] : i64
+// CHECK: %[[VAL_17:.*]] = arith.andi %[[VAL_16]], %[[VAL_15]] : i1
+// CHECK: %[[VAL_18:.*]] = arith.addi %[[VAL_13]], %[[EXTENT]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_19:.*]] = arith.select %[[VAL_17]], %[[VAL_18]], %[[VAL_13]] : i64
+// CHECK: %[[VAL_20:.*]] = hlfir.elemental %[[VAL_7]] typeparams %[[VAL_9]] unordered : (!fir.shape<1>, index) -> !hlfir.expr<?x!fir.char<2,?>> {
+// CHECK: ^bb0(%[[VAL_21:.*]]: index):
+// CHECK: %[[VAL_22:.*]] = arith.subi %[[VAL_10]], %[[VAL_19]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_19]], %[[VAL_10]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (index) -> i64
+// CHECK: %[[VAL_25:.*]] = arith.cmpi sle, %[[VAL_24]], %[[VAL_22]] : i64
+// CHECK: %[[VAL_26:.*]] = arith.select %[[VAL_25]], %[[VAL_19]], %[[VAL_23]] : i64
+// CHECK: %[[VAL_27:.*]] = arith.addi %[[VAL_24]], %[[VAL_26]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_27]] : (i64) -> index
+// CHECK: %[[VAL_29:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>) -> index
+// CHECK: %[[VAL_30:.*]] = arith.divsi %[[VAL_29]], %[[VAL_4]] : index
+// CHECK: %[[VAL_31:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>, index) -> (index, index, index)
+// CHECK: %[[VAL_32:.*]] = arith.subi %[[VAL_31]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_33:.*]] = arith.addi %[[VAL_28]], %[[VAL_32]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_34:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_33]]) typeparams %[[VAL_30]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>, index, index) -> !fir.boxchar<2>
+// CHECK: hlfir.yield_element %[[VAL_34]] : !fir.boxchar<2>
// CHECK: }
-// CHECK: return
+// CHECK: return %[[VAL_20]] : !hlfir.expr<?x!fir.char<2,?>>
// CHECK: }
func.func @cshift_vector_poly(%arg0: !fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>, %arg1: i32) -> !hlfir.expr<?x!fir.type<_QFFtestTt>?> {
@@ -184,37 +271,41 @@ func.func @cshift_vector_poly(%arg0: !fir.class<!fir.array<?x!fir.type<_QFFtestT
return %res : !hlfir.expr<?x!fir.type<_QFFtestTt>?>
}
// CHECK-LABEL: func.func @cshift_vector_poly(
-// CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>,
-// CHECK-SAME: %[[VAL_1:.*]]: i32) -> !hlfir.expr<?x!fir.type<_QFFtestTt>?> {
-// CHECK: %[[VAL_25:.*]] = arith.constant 1 : index
-// CHECK: %[[VAL_15:.*]] = arith.constant 0 : i64
-// CHECK: %[[VAL_5:.*]] = arith.constant 1 : i64
-// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>, index) -> (index, index, index)
-// CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_3]]#1 : (index) -> !fir.shape<1>
-// CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64
-// CHECK: %[[VAL_7:.*]] = hlfir.elemental %[[VAL_4]] mold %[[VAL_0]] unordered : (!fir.shape<1>, !fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>) -> !hlfir.expr<?x!fir.type<_QFFtestTt>?> {
-// CHECK: ^bb0(%[[VAL_8:.*]]: index):
-// CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (index) -> i64
-// CHECK: %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_6]] : i64
-// CHECK: %[[VAL_11:.*]] = arith.subi %[[VAL_10]], %[[VAL_5]] : i64
-// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_3]]#1 : (index) -> i64
-// CHECK: %[[VAL_13:.*]] = arith.remsi %[[VAL_11]], %[[VAL_12]] : i64
-// CHECK: %[[VAL_14:.*]] = arith.xori %[[VAL_11]], %[[VAL_12]] : i64
-// CHECK: %[[VAL_16:.*]] = arith.cmpi slt, %[[VAL_14]], %[[VAL_15]] : i64
-// CHECK: %[[VAL_17:.*]] = arith.cmpi ne, %[[VAL_13]], %[[VAL_15]] : i64
-// CHECK: %[[VAL_18:.*]] = arith.andi %[[VAL_17]], %[[VAL_16]] : i1
-// CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_13]], %[[VAL_12]] : i64
-// CHECK: %[[VAL_20:.*]] = arith.select %[[VAL_18]], %[[VAL_19]], %[[VAL_13]] : i64
-// CHECK: %[[VAL_21:.*]] = arith.addi %[[VAL_20]], %[[VAL_5]] : i64
-// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_21]] : (i64) -> index
-// CHECK: %[[VAL_24:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>, index) -> (index, index, index)
-// CHECK: %[[VAL_26:.*]] = arith.subi %[[VAL_24]]#0, %[[VAL_25]] : index
-// CHECK: %[[VAL_27:.*]] = arith.addi %[[VAL_22]], %[[VAL_26]] : index
+// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>,
+// CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: i32) -> !hlfir.expr<?x!fir.type<_QFFtestTt>?> {
+// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK: %[[ONE:.*]] = arith.constant 1 : i64
+// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i64
+// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>, index) -> (index, index, index)
+// CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1>
+// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]]#1 : (index) -> i64
+// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64
+// CHECK: %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_7]], %[[VAL_3]] : i64
+// CHECK: %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[ONE]], %[[VAL_7]] : i64
+// CHECK: %[[VAL_9:.*]] = arith.remsi %[[VAL_8]], %[[EXTENT]] : i64
+// CHECK: %[[VAL_10:.*]] = arith.xori %[[VAL_8]], %[[EXTENT]] : i64
+// CHECK: %[[VAL_11:.*]] = arith.cmpi slt, %[[VAL_10]], %[[VAL_3]] : i64
+// CHECK: %[[VAL_12:.*]] = arith.cmpi ne, %[[VAL_9]], %[[VAL_3]] : i64
+// CHECK: %[[VAL_13:.*]] = arith.andi %[[VAL_12]], %[[VAL_11]] : i1
+// CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_9]], %[[EXTENT]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_15:.*]] = arith.select %[[VAL_13]], %[[VAL_14]], %[[VAL_9]] : i64
+// CHECK: %[[VAL_16:.*]] = hlfir.elemental %[[VAL_6]] mold %[[VAL_0]] unordered : (!fir.shape<1>, !fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>) -> !hlfir.expr<?x!fir.type<_QFFtestTt>?> {
+// CHECK: ^bb0(%[[VAL_17:.*]]: index):
+// CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_7]], %[[VAL_15]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_19:.*]] = arith.subi %[[VAL_15]], %[[VAL_7]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_17]] : (index) -> i64
+// CHECK: %[[VAL_21:.*]] = arith.cmpi sle, %[[VAL_20]], %[[VAL_18]] : i64
+// CHECK: %[[VAL_22:.*]] = arith.select %[[VAL_21]], %[[VAL_15]], %[[VAL_19]] : i64
+// CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_20]], %[[VAL_22]] overflow<nsw, nuw> : i64
+// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (i64) -> index
+// CHECK: %[[VAL_25:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>, index) -> (index, index, index)
+// CHECK: %[[VAL_26:.*]] = arith.subi %[[VAL_25]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK: %[[VAL_27:.*]] = arith.addi %[[VAL_24]], %[[VAL_26]] overflow<nsw, nuw> : index
// CHECK: %[[VAL_28:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_27]]) : (!fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>, index) -> !fir.class<!fir.type<_QFFtestTt>>
// CHECK: hlfir.yield_element %[[VAL_28]] : !fir.class<!fir.type<_QFFtestTt>>
// CHECK: }
-// CHECK: return
+// CHECK: return %[[VAL_16]] : !hlfir.expr<?x!fir.type<_QFFtestTt>?>
// CHECK: }
// negative: non-constant dim argument
@@ -243,36 +334,13 @@ func.func @cshift_vector_assumed_dim_1(%arg0: !fir.box<!fir.array<?xi32>>, %arg1
return %res : !hlfir.expr<?xi32>
}
// CHECK-LABEL: func.func @cshift_vector_assumed_dim_1(
-// CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>,
-// CHECK-SAME: %[[VAL_1:.*]]: i32) -> !hlfir.expr<?xi32> {
-// CHECK: %[[VAL_26:.*]] = arith.constant 1 : index
-// CHECK: %[[VAL_16:.*]] = arith.constant 0 : i64
-// CHECK: %[[VAL_6:.*]] = arith.constant 1 : i64
-// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index
-// CHECK: %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-// CHECK: %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1>
-// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64
-// CHECK: %[[VAL_8:.*]] = hlfir.elemental %[[VAL_5]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
-// CHECK: ^bb0(%[[VAL_9:.*]]: index):
-// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (index) -> i64
-// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_10]], %[[VAL_7]] : i64
-// CHECK: %[[VAL_12:.*]] = arith.subi %[[VAL_11]], %[[VAL_6]] : i64
-// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_4]]#1 : (index) -> i64
-// CHECK: %[[VAL_14:.*]] = arith.remsi %[[VAL_12]], %[[VAL_13]] : i64
-// CHECK: %[[VAL_15:.*]] = arith.xori %[[VAL_12]], %[[VAL_13]] : i64
-// CHECK: %[[VAL_17:.*]] = arith.cmpi slt, %[[VAL_15]], %[[VAL_16]] : i64
-// CHECK: %[[VAL_18:.*]] = arith.cmpi ne, %[[VAL_14]], %[[VAL_16]] : i64
-// CHECK: %[[VAL_19:.*]] = arith.andi %[[VAL_18]], %[[VAL_17]] : i1
-// CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_14]], %[[VAL_13]] : i64
-// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_19]], %[[VAL_20]], %[[VAL_14]] : i64
-// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_6]] : i64
-// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i64) -> index
-// CHECK: %[[VAL_25:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-// CHECK: %[[VAL_27:.*]] = arith.subi %[[VAL_25]]#0, %[[VAL_26]] : index
-// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_23]], %[[VAL_27]] : index
-// CHECK: %[[VAL_29:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_28]]) : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-// CHECK: %[[VAL_30:.*]] = fir.load %[[VAL_29]] : !fir.ref<i32>
-// CHECK: hlfir.yield_element %[[VAL_30]] : i32
-// CHECK: }
-// CHECK: return
-// CHECK: }
+// CHECK-NOT: hlfir.cshift
+
+// Check that hlfir.cshift is converted to hlfir.elemental
+// when the argument is an array expression:
+func.func @cshift_vector_expr(%arg0: !hlfir.expr<?xi32>, %arg1: !fir.ref<i32>) -> !hlfir.expr<?xi32>{
+ %res = hlfir.cshift %arg0 %arg1 : (!hlfir.expr<?xi32>, !fir.ref<i32>) -> !hlfir.expr<?xi32>
+ return %res : !hlfir.expr<?xi32>
+}
+// CHECK-LABEL: func.func @cshift_vector_expr(
+// CHECK: hlfir.elemental
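
(Not part of the patch; a minimal cross-check sketch.) The CHECK lines above verify the arith.select-based index computation that the inlined hlfir.elemental performs for CSHIFT(ARRAY, SH) on a one-based vector: a truncating remsi, a sign fix-up built from xori/cmpi/andi, and a final select that picks between i + r and i + (r - extent). The C function below restates that mapping under the assumption of a one-based index and omits the lower-bound adjustment done via fir.box_dims; the name cshift_index and the zero-extent guard are illustrative only.

    #include <stdio.h>

    /* Index mapping equivalent to the inlined CSHIFT elemental body:
       returns the one-based source index for result element i of
       CSHIFT(array(1:n), sh). */
    static long cshift_index(long i, long sh, long n) {
      long extent = (n == 0) ? 1 : n;        /* guard for zero-sized arrays (arith.select on ISZERO) */
      long r = sh % extent;                  /* arith.remsi: truncating remainder */
      if (r != 0 && ((sh ^ extent) < 0))     /* signs differ and remainder nonzero: xori/cmpi/andi */
        r += extent;                         /* now 0 <= r < extent, i.e. a true modulo */
      return (i <= extent - r) ? i + r       /* arith.select in the elemental body */
                               : i + (r - extent);
    }

    int main(void) {
      /* CSHIFT([10,20,30,40,50], 2) takes its first element from array(3),
         its fourth from array(1), and so on. */
      for (long i = 1; i <= 5; ++i)
        printf("%ld -> %ld\n", i, cshift_index(i, 2, 5));
      return 0;
    }

For a negative shift such as sh = -1 the xori-based fix-up adds the extent back, so result element 1 maps to array(n), matching the Fortran CSHIFT semantics.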