[llvm] [flang-rt] Optimise ShallowCopy and use it in CopyInAssign (PR #140569)

Yusuke MINATO via llvm-commits llvm-commits at lists.llvm.org
Wed May 21 21:54:05 PDT 2025


================
@@ -114,58 +114,148 @@ RT_API_ATTRS void CheckIntegerKind(
   }
 }
 
+template <typename P, int RANK>
 RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from) {
-  SubscriptValue toAt[maxRank], fromAt[maxRank];
-  to.GetLowerBounds(toAt);
-  from.GetLowerBounds(fromAt);
+  DescriptorIterator<RANK> toIt{to};
+  DescriptorIterator<RANK> fromIt{from};
+  // Knowing the size at compile time can enable memcpy inlining optimisations
+  constexpr std::size_t typeElementBytes{sizeof(P)};
+  // We might still need to check the actual size as a fallback
   std::size_t elementBytes{to.ElementBytes()};
   for (std::size_t n{to.Elements()}; n-- > 0;
-       to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-    std::memcpy(
-        to.Element<char>(toAt), from.Element<char>(fromAt), elementBytes);
+      toIt.Advance(), fromIt.Advance()) {
+    // typeElementBytes == 1 when P is a char - the non-specialised case
+    if constexpr (typeElementBytes != 1) {
+      std::memcpy(
+          toIt.template Get<P>(), fromIt.template Get<P>(), typeElementBytes);
+    } else {
+      std::memcpy(
+          toIt.template Get<P>(), fromIt.template Get<P>(), elementBytes);
+    }
   }
 }
 
+template <typename P, int RANK>
 RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
     const Descriptor &to, const Descriptor &from) {
   char *toAt{to.OffsetElement()};
-  SubscriptValue fromAt[maxRank];
-  from.GetLowerBounds(fromAt);
+  constexpr std::size_t typeElementBytes{sizeof(P)};
   std::size_t elementBytes{to.ElementBytes()};
+  DescriptorIterator<RANK> fromIt{from};
   for (std::size_t n{to.Elements()}; n-- > 0;
-       toAt += elementBytes, from.IncrementSubscripts(fromAt)) {
-    std::memcpy(toAt, from.Element<char>(fromAt), elementBytes);
+      toAt += elementBytes, fromIt.Advance()) {
+    if constexpr (typeElementBytes != 1) {
+      std::memcpy(toAt, fromIt.template Get<P>(), typeElementBytes);
+    } else {
+      std::memcpy(toAt, fromIt.template Get<P>(), elementBytes);
+    }
   }
 }
 
+template <typename P, int RANK>
 RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from) {
-  SubscriptValue toAt[maxRank];
-  to.GetLowerBounds(toAt);
   char *fromAt{from.OffsetElement()};
+  DescriptorIterator<RANK> toIt{to};
+  constexpr std::size_t typeElementBytes{sizeof(P)};
   std::size_t elementBytes{to.ElementBytes()};
   for (std::size_t n{to.Elements()}; n-- > 0;
-       to.IncrementSubscripts(toAt), fromAt += elementBytes) {
-    std::memcpy(to.Element<char>(toAt), fromAt, elementBytes);
+      toIt.Advance(), fromAt += elementBytes) {
+    if constexpr (typeElementBytes != 1) {
+      std::memcpy(toIt.template Get<P>(), fromAt, typeElementBytes);
+    } else {
+      std::memcpy(toIt.template Get<P>(), fromAt, elementBytes);
+    }
   }
 }
 
-RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
+// ShallowCopy helper for calling the correct specialised variant based on
+// scenario
+template <typename P, int RANK = -1>
+RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from,
     bool toIsContiguous, bool fromIsContiguous) {
   if (toIsContiguous) {
     if (fromIsContiguous) {
       std::memcpy(to.OffsetElement(), from.OffsetElement(),
           to.Elements() * to.ElementBytes());
     } else {
-      ShallowCopyDiscontiguousToContiguous(to, from);
+      ShallowCopyDiscontiguousToContiguous<P, RANK>(to, from);
     }
   } else {
     if (fromIsContiguous) {
-      ShallowCopyContiguousToDiscontiguous(to, from);
+      ShallowCopyContiguousToDiscontiguous<P, RANK>(to, from);
+    } else {
+      ShallowCopyDiscontiguousToDiscontiguous<P, RANK>(to, from);
+    }
+  }
+}
+
+// Most arrays are much closer to rank-1 than to maxRank.
+// Doing the recursion upwards instead of downwards puts the more common
+// cases earlier in the if-chain and has a tangible impact on performance.
+template <typename P, int RANK> struct ShallowCopyRankSpecialize {
+  static bool execute(const Descriptor &to, const Descriptor &from,
+      bool toIsContiguous, bool fromIsContiguous) {
+    if (to.rank() == RANK && from.rank() == RANK) {
+      ShallowCopyInner<P, RANK>(to, from, toIsContiguous, fromIsContiguous);
+      return true;
+    }
+    return ShallowCopyRankSpecialize<P, RANK + 1>::execute(
+        to, from, toIsContiguous, fromIsContiguous);
+  }
+};
+
+template <typename P> struct ShallowCopyRankSpecialize<P, maxRank + 1> {
+  static bool execute(const Descriptor &to, const Descriptor &from,
+      bool toIsContiguous, bool fromIsContiguous) {
+    return false;
+  }
+};
+
+// ShallowCopy helper for specialising the variants based on array rank
+template <typename P>
+RT_API_ATTRS void ShallowCopyRank(const Descriptor &to, const Descriptor &from,
+    bool toIsContiguous, bool fromIsContiguous) {
+  // Try to call a specialised ShallowCopy variant from rank-1 up to maxRank
+  bool specialized = ShallowCopyRankSpecialize<P, 1>::execute(
+      to, from, toIsContiguous, fromIsContiguous);
----------------
yus3710-fj wrote:

```suggestion
  bool specialized{ShallowCopyRankSpecialize<P, 1>::execute(
      to, from, toIsContiguous, fromIsContiguous)};
```

https://github.com/llvm/llvm-project/pull/140569
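
A note on the `if constexpr` split in the patch: when the length passed to `std::memcpy` is a compile-time constant, compilers can typically inline the copy as a few loads and stores instead of emitting a library call, which is what the `typeElementBytes` specialisation is after. A minimal standalone sketch of the same pattern (`copyElement` is an illustrative name, not from the patch):

```cpp
#include <cstddef>
#include <cstring>

// P names the element type when it is known at compile time;
// P = char is the generic fallback where only the run-time
// element size is available.
template <typename P>
void copyElement(char *to, const char *from, std::size_t runtimeBytes) {
  if constexpr (sizeof(P) != 1) {
    // Constant length: eligible for inlining into direct loads/stores.
    std::memcpy(to, from, sizeof(P));
  } else {
    // Length only known at run time: generic memcpy call.
    std::memcpy(to, from, runtimeBytes);
  }
}

// Instantiate both branches explicitly for illustration.
template void copyElement<double>(char *, const char *, std::size_t);
template void copyElement<char>(char *, const char *, std::size_t);
```

The `P = char` instantiation preserves the original behaviour for element types the runtime does not specialise on, falling back to the run-time element size.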


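The rank-dispatch half of the patch (`ShallowCopyRankSpecialize`) uses a recursive template to turn the run-time rank into a compile-time constant, counting upwards so that rank 1, by far the most common case, is tested first. A self-contained sketch of that dispatch technique under illustrative names (`RankDispatch`, `kMaxRank`, and `copyWithKnownRank` are not the patch's identifiers):

```cpp
#include <cstdio>

constexpr int kMaxRank{3}; // stand-in for the runtime's maxRank

struct Array {
  int rank; // rank known only at run time
};

// Worker with the rank baked in as a template parameter; the patch
// uses the analogous constant to instantiate DescriptorIterator<RANK>.
template <int RANK> void copyWithKnownRank(const Array &, const Array &) {
  std::printf("specialised copy for rank %d\n", RANK);
}

// Recurse upwards so rank 1 (the common case) is tested first; the
// compiler flattens the recursion into a short if-chain.
template <int RANK> struct RankDispatch {
  static bool execute(const Array &to, const Array &from) {
    if (to.rank == RANK && from.rank == RANK) {
      copyWithKnownRank<RANK>(to, from);
      return true;
    }
    return RankDispatch<RANK + 1>::execute(to, from);
  }
};

// Base case: past kMaxRank, report that no specialisation matched.
template <> struct RankDispatch<kMaxRank + 1> {
  static bool execute(const Array &, const Array &) { return false; }
};

int main() {
  Array a{2}, b{2};
  if (!RankDispatch<1>::execute(a, b)) {
    std::printf("fall back to the generic rank-agnostic path\n");
  }
}
```

When no rank matches (for instance, if the two descriptors disagree), `execute` returns false and the caller can fall back to a rank-agnostic path, presumably the `RANK = -1` default visible on `ShallowCopyInner` in the diff.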