[llvm] [LoongArch] lower vector shuffle as byte rotate (if possible) (PR #135157)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 14 00:02:47 PDT 2025
================
@@ -696,6 +696,140 @@ static void computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1,
}
}
+/// Test whether \p Mask applies the same in-lane shuffle to every lane of
+/// \p LaneSizeInBits bits (LaneSize = LaneSizeInBits / scalar bits of \p VT).
+/// Returns false if a defined entry crosses lanes or two lanes disagree on a slot.
+/// On success \p RepeatedMask holds the LaneSize-entry per-lane mask: -1 where
+/// every lane is undef, [0, LaneSize) for elements of the first vector, and
+/// [LaneSize, 2*LaneSize) for elements remapped from the second vector.
+static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
+ ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+ RepeatedMask.assign(LaneSize, -1);
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i) {
+ assert(Mask[i] == -1 || Mask[i] >= 0);
+ if (Mask[i] < 0)
+ continue;
+ if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+ // Source lane (index taken modulo Size) differs from destination lane i.
+ return false;
+
+ // In-lane entry: reduce it to a lane-local index and compare across lanes.
+ // Second-vector indices (Mask[i] >= Size) are rebased to start at LaneSize.
+ int LocalM =
+ Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
+ if (RepeatedMask[i % LaneSize] < 0)
+ // First defined entry seen for this slot in any lane; record it.
+ RepeatedMask[i % LaneSize] = LocalM;
+ else if (RepeatedMask[i % LaneSize] != LocalM)
+ // An earlier lane already chose a different element for this slot.
+ return false;
+ }
+ return true;
+}
+
+/// Attempts to match vector shuffle as byte rotation.
+static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask) {
+
+ SDValue Lo, Hi;
+ SmallVector<int, 16> RepeatedMask;
+
+ if (!isRepeatedShuffleMask(128, VT, Mask, RepeatedMask))
+ return -1;
+
+ int NumElts = RepeatedMask.size();
+ int Rotation = 0;
+ int Scale = 16 / NumElts;
+
+ for (int i = 0; i < NumElts; ++i) {
+ int M = RepeatedMask[i];
+ assert((M == -1 || (0 <= M && M < (2 * NumElts))) &&
+ "Unexpected mask index.");
+ if (M < 0)
+ continue;
+
+ // Determine where a rotated vector would have started.
+ int StartIdx = i - (M % NumElts);
+ if (StartIdx == 0)
+ return -1;
+
+ // If we found the tail of a vector the rotation must be the missing
+ // front. If we found the head of a vector, it must be how much of the
+ // head.
+ // int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
+ int CandidateRotation = (NumElts - StartIdx) % NumElts;
----------------
heiher wrote:
Why bother with this replacement? Since `NumElts` isn't a compile-time constant, the `modulo` might actually be slower than a simple `cmov`.
https://github.com/llvm/llvm-project/pull/135157
More information about the llvm-commits
mailing list