[llvm] [flang-rt] Optimise ShallowCopy and use it in CopyInAssign (PR #140569)
Kajetan Puchalski via llvm-commits
llvm-commits at lists.llvm.org
Thu May 22 05:08:31 PDT 2025
================
@@ -114,58 +114,148 @@ RT_API_ATTRS void CheckIntegerKind(
}
}
+template <typename P, int RANK>
RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
const Descriptor &to, const Descriptor &from) {
- SubscriptValue toAt[maxRank], fromAt[maxRank];
- to.GetLowerBounds(toAt);
- from.GetLowerBounds(fromAt);
+ DescriptorIterator<RANK> toIt{to};
+ DescriptorIterator<RANK> fromIt{from};
+ // Knowing the size at compile time can enable memcpy inlining optimisations
+ constexpr std::size_t typeElementBytes{sizeof(P)};
+ // We might still need to check the actual size as a fallback
std::size_t elementBytes{to.ElementBytes()};
for (std::size_t n{to.Elements()}; n-- > 0;
- to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
- std::memcpy(
- to.Element<char>(toAt), from.Element<char>(fromAt), elementBytes);
+ toIt.Advance(), fromIt.Advance()) {
+ // typeElementBytes == 1 when P is a char - the non-specialised case
+ if constexpr (typeElementBytes != 1) {
+ std::memcpy(
+ toIt.template Get<P>(), fromIt.template Get<P>(), typeElementBytes);
+ } else {
+ std::memcpy(
+ toIt.template Get<P>(), fromIt.template Get<P>(), elementBytes);
+ }
}
}
+template <typename P, int RANK>
RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
const Descriptor &to, const Descriptor &from) {
char *toAt{to.OffsetElement()};
- SubscriptValue fromAt[maxRank];
- from.GetLowerBounds(fromAt);
+ constexpr std::size_t typeElementBytes{sizeof(P)};
std::size_t elementBytes{to.ElementBytes()};
+ DescriptorIterator<RANK> fromIt{from};
for (std::size_t n{to.Elements()}; n-- > 0;
- toAt += elementBytes, from.IncrementSubscripts(fromAt)) {
- std::memcpy(toAt, from.Element<char>(fromAt), elementBytes);
+ toAt += elementBytes, fromIt.Advance()) {
+ if constexpr (typeElementBytes != 1) {
+ std::memcpy(toAt, fromIt.template Get<P>(), typeElementBytes);
+ } else {
+ std::memcpy(toAt, fromIt.template Get<P>(), elementBytes);
+ }
}
}
+template <typename P, int RANK>
RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
const Descriptor &to, const Descriptor &from) {
- SubscriptValue toAt[maxRank];
- to.GetLowerBounds(toAt);
char *fromAt{from.OffsetElement()};
+ DescriptorIterator<RANK> toIt{to};
+ constexpr std::size_t typeElementBytes{sizeof(P)};
std::size_t elementBytes{to.ElementBytes()};
for (std::size_t n{to.Elements()}; n-- > 0;
- to.IncrementSubscripts(toAt), fromAt += elementBytes) {
- std::memcpy(to.Element<char>(toAt), fromAt, elementBytes);
+ toIt.Advance(), fromAt += elementBytes) {
+ if constexpr (typeElementBytes != 1) {
+ std::memcpy(toIt.template Get<P>(), fromAt, typeElementBytes);
+ } else {
+ std::memcpy(toIt.template Get<P>(), fromAt, elementBytes);
+ }
}
}
-RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
+// ShallowCopy helper for calling the correct specialised variant based on
+// scenario
+template <typename P, int RANK = -1>
+RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from,
bool toIsContiguous, bool fromIsContiguous) {
if (toIsContiguous) {
if (fromIsContiguous) {
std::memcpy(to.OffsetElement(), from.OffsetElement(),
to.Elements() * to.ElementBytes());
} else {
- ShallowCopyDiscontiguousToContiguous(to, from);
+ ShallowCopyDiscontiguousToContiguous<P, RANK>(to, from);
}
} else {
if (fromIsContiguous) {
- ShallowCopyContiguousToDiscontiguous(to, from);
+ ShallowCopyContiguousToDiscontiguous<P, RANK>(to, from);
+ } else {
+ ShallowCopyDiscontiguousToDiscontiguous<P, RANK>(to, from);
+ }
+ }
+}
+
+// Most arrays are much closer to rank-1 than to maxRank.
+// Doing the recursion upwards instead of downwards puts the more common
+// cases earlier in the if-chain and has a tangible impact on performance.
+template <typename P, int RANK> struct ShallowCopyRankSpecialize {
+ static bool execute(const Descriptor &to, const Descriptor &from,
+ bool toIsContiguous, bool fromIsContiguous) {
+ if (to.rank() == RANK && from.rank() == RANK) {
----------------
mrkajetanp wrote:
I don't think it would make much of a difference in practice. Technically this check is overly cautious, as ShallowCopy is only _supposed_ to be called when both arrays have the same rank. But on balance I think it's safer to make sure we don't accidentally hit a rank-specialised variant when one of the arrays is not actually of that rank.
https://github.com/llvm/llvm-project/pull/140569
More information about the llvm-commits mailing list