[llvm-branch-commits] [llvm] [LoongArch] Use xvperm.w for cross-lane access within a single vector (PR #151634)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Sep 2 00:08:02 PDT 2025
================
@@ -1832,6 +1832,48 @@ static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG);
}
+/// Lower VECTOR_SHUFFLE into XVPERM (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
+ // LoongArch LASX has only XVPERM_W.
+ if (Mask.size() != 8 || (VT != MVT::v8i32 && VT != MVT::v8f32))
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfSize = NumElts / 2;
+ bool FrontLo = true, FrontHi = true;
+ bool BackLo = true, BackHi = true;
+
+ auto inRange = [](int val, int low, int high) {
+ return (val == -1) || (val >= low && val < high);
+ };
+
+ for (unsigned i = 0; i < HalfSize; ++i) {
+ int Fronti = Mask[i];
+ int Backi = Mask[i + HalfSize];
+
+ FrontLo &= inRange(Fronti, 0, HalfSize);
+ FrontHi &= inRange(Fronti, HalfSize, NumElts);
+ BackLo &= inRange(Backi, 0, HalfSize);
+ BackHi &= inRange(Backi, HalfSize, NumElts);
+ }
+
+ // If both the lower and upper 128-bit parts access only one half of the
+ // vector (either lower or upper), avoid using xvperm.w: its latency
+ // (3 cycles) is higher than that of xvshuf (1 cycle) plus xvori (1 cycle).
----------------
heiher wrote:
For a shuffle that swaps the upper and lower 128-bit halves, `xvperm.w` alone should be enough and likely faster.
```llvm
define <8 x i32> @shuffle_v8i32(<8 x i32> %a) {
%shuffle = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 3, i32 2, i32 0, i32 1>
ret <8 x i32> %shuffle
}
```
https://github.com/llvm/llvm-project/pull/151634
More information about the llvm-branch-commits
mailing list