[llvm] [flang-rt] Optimise ShallowCopy and use it in CopyInAssign (PR #140569)

Kajetan Puchalski via llvm-commits llvm-commits at lists.llvm.org
Thu May 22 05:08:31 PDT 2025


================
@@ -114,58 +114,148 @@ RT_API_ATTRS void CheckIntegerKind(
   }
 }
 
+template <typename P, int RANK>
 RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from) {
-  SubscriptValue toAt[maxRank], fromAt[maxRank];
-  to.GetLowerBounds(toAt);
-  from.GetLowerBounds(fromAt);
+  DescriptorIterator<RANK> toIt{to};
+  DescriptorIterator<RANK> fromIt{from};
+  // Knowing the size at compile time can enable memcpy inlining optimisations
+  constexpr std::size_t typeElementBytes{sizeof(P)};
+  // We might still need to check the actual size as a fallback
   std::size_t elementBytes{to.ElementBytes()};
   for (std::size_t n{to.Elements()}; n-- > 0;
-       to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-    std::memcpy(
-        to.Element<char>(toAt), from.Element<char>(fromAt), elementBytes);
+      toIt.Advance(), fromIt.Advance()) {
+    // typeElementBytes == 1 when P is a char - the non-specialised case
+    if constexpr (typeElementBytes != 1) {
+      std::memcpy(
+          toIt.template Get<P>(), fromIt.template Get<P>(), typeElementBytes);
+    } else {
+      std::memcpy(
+          toIt.template Get<P>(), fromIt.template Get<P>(), elementBytes);
+    }
   }
 }
 
+template <typename P, int RANK>
 RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
     const Descriptor &to, const Descriptor &from) {
   char *toAt{to.OffsetElement()};
-  SubscriptValue fromAt[maxRank];
-  from.GetLowerBounds(fromAt);
+  constexpr std::size_t typeElementBytes{sizeof(P)};
   std::size_t elementBytes{to.ElementBytes()};
+  DescriptorIterator<RANK> fromIt{from};
   for (std::size_t n{to.Elements()}; n-- > 0;
-       toAt += elementBytes, from.IncrementSubscripts(fromAt)) {
-    std::memcpy(toAt, from.Element<char>(fromAt), elementBytes);
+      toAt += elementBytes, fromIt.Advance()) {
+    if constexpr (typeElementBytes != 1) {
+      std::memcpy(toAt, fromIt.template Get<P>(), typeElementBytes);
+    } else {
+      std::memcpy(toAt, fromIt.template Get<P>(), elementBytes);
+    }
   }
 }
 
+template <typename P, int RANK>
 RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from) {
-  SubscriptValue toAt[maxRank];
-  to.GetLowerBounds(toAt);
   char *fromAt{from.OffsetElement()};
+  DescriptorIterator<RANK> toIt{to};
+  constexpr std::size_t typeElementBytes{sizeof(P)};
   std::size_t elementBytes{to.ElementBytes()};
   for (std::size_t n{to.Elements()}; n-- > 0;
-       to.IncrementSubscripts(toAt), fromAt += elementBytes) {
-    std::memcpy(to.Element<char>(toAt), fromAt, elementBytes);
+      toIt.Advance(), fromAt += elementBytes) {
+    if constexpr (typeElementBytes != 1) {
+      std::memcpy(toIt.template Get<P>(), fromAt, typeElementBytes);
+    } else {
+      std::memcpy(toIt.template Get<P>(), fromAt, elementBytes);
+    }
   }
 }
 
-RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
+// ShallowCopy helper for calling the correct specialised variant based on
+// scenario
+template <typename P, int RANK = -1>
+RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from,
     bool toIsContiguous, bool fromIsContiguous) {
   if (toIsContiguous) {
     if (fromIsContiguous) {
       std::memcpy(to.OffsetElement(), from.OffsetElement(),
           to.Elements() * to.ElementBytes());
     } else {
-      ShallowCopyDiscontiguousToContiguous(to, from);
+      ShallowCopyDiscontiguousToContiguous<P, RANK>(to, from);
     }
   } else {
     if (fromIsContiguous) {
-      ShallowCopyContiguousToDiscontiguous(to, from);
+      ShallowCopyContiguousToDiscontiguous<P, RANK>(to, from);
+    } else {
+      ShallowCopyDiscontiguousToDiscontiguous<P, RANK>(to, from);
+    }
+  }
+}
+
+// Most arrays are much closer to rank-1 than to maxRank.
+// Doing the recursion upwards instead of downwards puts the more common
+// cases earlier in the if-chain and has a tangible impact on performance.
+template <typename P, int RANK> struct ShallowCopyRankSpecialize {
+  static bool execute(const Descriptor &to, const Descriptor &from,
+      bool toIsContiguous, bool fromIsContiguous) {
+    if (to.rank() == RANK && from.rank() == RANK) {
----------------
mrkajetanp wrote:

I don't think it'd make much of a difference in practice. Technically the check is just being overly cautious, since ShallowCopy is only _supposed_ to be called when both arrays have the same rank. On balance, though, I think it's safer to ensure we don't accidentally dispatch to a rank-specialised variant when one of the arrays actually has a different rank.

https://github.com/llvm/llvm-project/pull/140569


More information about the llvm-commits mailing list