[llvm] [flang-rt] Optimise ShallowCopy and use it in CopyInAssign (PR #140569)
Yusuke MINATO via llvm-commits
llvm-commits at lists.llvm.org
Wed May 21 21:54:05 PDT 2025
================
@@ -114,58 +114,148 @@ RT_API_ATTRS void CheckIntegerKind(
}
}
+template <typename P, int RANK>
RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
const Descriptor &to, const Descriptor &from) {
- SubscriptValue toAt[maxRank], fromAt[maxRank];
- to.GetLowerBounds(toAt);
- from.GetLowerBounds(fromAt);
+ DescriptorIterator<RANK> toIt{to};
+ DescriptorIterator<RANK> fromIt{from};
+ // Knowing the size at compile time can enable memcpy inlining optimisations
+ constexpr std::size_t typeElementBytes{sizeof(P)};
+ // We might still need to check the actual size as a fallback
std::size_t elementBytes{to.ElementBytes()};
for (std::size_t n{to.Elements()}; n-- > 0;
- to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
- std::memcpy(
- to.Element<char>(toAt), from.Element<char>(fromAt), elementBytes);
+ toIt.Advance(), fromIt.Advance()) {
+    // typeElementBytes == 1 when P is char, i.e. the non-specialised fallback
+ if constexpr (typeElementBytes != 1) {
+ std::memcpy(
+ toIt.template Get<P>(), fromIt.template Get<P>(), typeElementBytes);
+ } else {
+ std::memcpy(
+ toIt.template Get<P>(), fromIt.template Get<P>(), elementBytes);
+ }
}
}
+template <typename P, int RANK>
RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
const Descriptor &to, const Descriptor &from) {
char *toAt{to.OffsetElement()};
- SubscriptValue fromAt[maxRank];
- from.GetLowerBounds(fromAt);
+ constexpr std::size_t typeElementBytes{sizeof(P)};
std::size_t elementBytes{to.ElementBytes()};
+ DescriptorIterator<RANK> fromIt{from};
for (std::size_t n{to.Elements()}; n-- > 0;
- toAt += elementBytes, from.IncrementSubscripts(fromAt)) {
- std::memcpy(toAt, from.Element<char>(fromAt), elementBytes);
+ toAt += elementBytes, fromIt.Advance()) {
+ if constexpr (typeElementBytes != 1) {
+ std::memcpy(toAt, fromIt.template Get<P>(), typeElementBytes);
+ } else {
+ std::memcpy(toAt, fromIt.template Get<P>(), elementBytes);
+ }
}
}
+template <typename P, int RANK>
RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
const Descriptor &to, const Descriptor &from) {
- SubscriptValue toAt[maxRank];
- to.GetLowerBounds(toAt);
char *fromAt{from.OffsetElement()};
+ DescriptorIterator<RANK> toIt{to};
+ constexpr std::size_t typeElementBytes{sizeof(P)};
std::size_t elementBytes{to.ElementBytes()};
for (std::size_t n{to.Elements()}; n-- > 0;
- to.IncrementSubscripts(toAt), fromAt += elementBytes) {
- std::memcpy(to.Element<char>(toAt), fromAt, elementBytes);
+ toIt.Advance(), fromAt += elementBytes) {
+ if constexpr (typeElementBytes != 1) {
+ std::memcpy(toIt.template Get<P>(), fromAt, typeElementBytes);
+ } else {
+ std::memcpy(toIt.template Get<P>(), fromAt, elementBytes);
+ }
}
}
-RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
+// ShallowCopy helper that dispatches to the correct specialised variant based
+// on the contiguity of the destination and source descriptors
+template <typename P, int RANK = -1>
+RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from,
bool toIsContiguous, bool fromIsContiguous) {
if (toIsContiguous) {
if (fromIsContiguous) {
std::memcpy(to.OffsetElement(), from.OffsetElement(),
to.Elements() * to.ElementBytes());
} else {
- ShallowCopyDiscontiguousToContiguous(to, from);
+ ShallowCopyDiscontiguousToContiguous<P, RANK>(to, from);
}
} else {
if (fromIsContiguous) {
- ShallowCopyContiguousToDiscontiguous(to, from);
+ ShallowCopyContiguousToDiscontiguous<P, RANK>(to, from);
+ } else {
+ ShallowCopyDiscontiguousToDiscontiguous<P, RANK>(to, from);
+ }
+ }
+}
+
+// Most arrays are much closer to rank-1 than to maxRank.
+// Doing the recursion upwards instead of downwards puts the more common
+// cases earlier in the if-chain and has a tangible impact on performance.
+template <typename P, int RANK> struct ShallowCopyRankSpecialize {
+ static bool execute(const Descriptor &to, const Descriptor &from,
+ bool toIsContiguous, bool fromIsContiguous) {
+ if (to.rank() == RANK && from.rank() == RANK) {
+ ShallowCopyInner<P, RANK>(to, from, toIsContiguous, fromIsContiguous);
+ return true;
+ }
+ return ShallowCopyRankSpecialize<P, RANK + 1>::execute(
+ to, from, toIsContiguous, fromIsContiguous);
+ }
+};
+
+template <typename P> struct ShallowCopyRankSpecialize<P, maxRank + 1> {
+ static bool execute(const Descriptor &to, const Descriptor &from,
+ bool toIsContiguous, bool fromIsContiguous) {
+ return false;
+ }
+};
+
+// ShallowCopy helper for specialising the variants based on array rank
+template <typename P>
+RT_API_ATTRS void ShallowCopyRank(const Descriptor &to, const Descriptor &from,
+ bool toIsContiguous, bool fromIsContiguous) {
+ // Try to call a specialised ShallowCopy variant from rank-1 up to maxRank
+ bool specialized = ShallowCopyRankSpecialize<P, 1>::execute(
+ to, from, toIsContiguous, fromIsContiguous);
----------------
yus3710-fj wrote:
```suggestion
bool specialized{ShallowCopyRankSpecialize<P, 1>::execute(
to, from, toIsContiguous, fromIsContiguous)};
```
https://github.com/llvm/llvm-project/pull/140569
More information about the llvm-commits
mailing list