[llvm] [RISCV][CG]Use processShuffleMasks for per-register shuffles (PR #121765)
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 7 12:30:03 PST 2025
================
@@ -5121,58 +5120,102 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
MVT ElemVT = VT.getVectorElementType();
unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
- unsigned VRegsPerSrc = NumElts / ElemsPerVReg;
-
- SmallVector<std::pair<int, SmallVector<int>>>
- OutMasks(VRegsPerSrc, {-1, {}});
-
- // Check if our mask can be done as a 1-to-1 mapping from source
- // to destination registers in the group without needing to
- // write each destination more than once.
- for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx++) {
- int DstVecIdx = DstIdx / ElemsPerVReg;
- int DstSubIdx = DstIdx % ElemsPerVReg;
- int SrcIdx = Mask[DstIdx];
- if (SrcIdx < 0 || (unsigned)SrcIdx >= 2 * NumElts)
- continue;
- int SrcVecIdx = SrcIdx / ElemsPerVReg;
- int SrcSubIdx = SrcIdx % ElemsPerVReg;
- if (OutMasks[DstVecIdx].first == -1)
- OutMasks[DstVecIdx].first = SrcVecIdx;
- if (OutMasks[DstVecIdx].first != SrcVecIdx)
- // Note: This case could easily be handled by keeping track of a chain
- // of source values and generating two element shuffles below. This is
- // less an implementation question, and more a profitability one.
- return SDValue();
-
- OutMasks[DstVecIdx].second.resize(ElemsPerVReg, -1);
- OutMasks[DstVecIdx].second[DstSubIdx] = SrcSubIdx;
- }
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
assert(M1VT == getLMUL1VT(M1VT));
unsigned NumOpElts = M1VT.getVectorMinNumElements();
- SDValue Vec = DAG.getUNDEF(ContainerVT);
+ unsigned NormalizedVF = ContainerVT.getVectorMinNumElements();
+ unsigned NumOfSrcRegs = NormalizedVF / NumOpElts;
+ unsigned NumOfDestRegs = NormalizedVF / NumOpElts;
// The following semantically builds up a fixed length concat_vector
// of the component shuffle_vectors. We eagerly lower to scalable here
// to avoid DAG combining it back to a large shuffle_vector again.
V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
- for (unsigned DstVecIdx = 0 ; DstVecIdx < OutMasks.size(); DstVecIdx++) {
- auto &[SrcVecIdx, SrcSubMask] = OutMasks[DstVecIdx];
- if (SrcVecIdx == -1)
- continue;
- unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts;
- SDValue SrcVec = (unsigned)SrcVecIdx >= VRegsPerSrc ? V2 : V1;
+ SmallVector<SmallVector<std::tuple<unsigned, unsigned, SmallVector<int>>>>
+ Operands;
+ processShuffleMasks(
+ Mask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs,
+ [&]() { Operands.emplace_back(); },
+ [&](ArrayRef<int> SrcSubMask, unsigned SrcVecIdx, unsigned DstVecIdx) {
+ Operands.emplace_back().emplace_back(
+ SrcVecIdx, UINT_MAX,
+ SmallVector<int>(SrcSubMask.begin(), SrcSubMask.end()));
+ },
+ [&](ArrayRef<int> SrcSubMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
+ if (NewReg)
+ Operands.emplace_back();
+ Operands.back().emplace_back(
+ Idx1, Idx2, SmallVector<int>(SrcSubMask.begin(), SrcSubMask.end()));
+ });
+ assert(Operands.size() == NumOfDestRegs && "Whole vector must be processed");
+ // Note: check that we do not emit too many shuffles here to prevent code
+ // size explosion.
+ // TODO: investigate, if it can be improved by extra analysis of the masks to
+ // check if the code is more profitable.
+ unsigned NumShuffles = std::accumulate(
+ Operands.begin(), Operands.end(), 0u,
+ [&](unsigned N,
+ ArrayRef<std::tuple<unsigned, unsigned, SmallVector<int>>> Data) {
+ if (Data.empty())
+ return N;
+ N += Data.size();
+ for (const auto &P : Data) {
+ unsigned Idx2 = std::get<1>(P);
+ ArrayRef<int> Mask = std::get<2>(P);
+ if (Idx2 != UINT_MAX)
+ ++N;
+ else if (ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
+ --N;
+ }
+ return N;
+ });
+ if ((NumOfDestRegs > 2 && NumShuffles > NumOfDestRegs) ||
+ (NumOfDestRegs <= 2 && NumShuffles >= 2 * NumOfDestRegs))
+ return SDValue();
+ auto ExtractValue = [&, &DAG = DAG](SDValue SrcVec, unsigned ExtractIdx) {
SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
DAG.getVectorIdxConstant(ExtractIdx, DL));
SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
- SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
- SubVec = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
- unsigned InsertIdx = DstVecIdx * NumOpElts;
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, SubVec,
+ return SubVec;
+ };
+ auto PerformShuffle = [&, &DAG = DAG](SDValue SubVec1, SDValue SubVec2,
+ ArrayRef<int> Mask) {
+ SDValue SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec1, SubVec2, Mask);
+ return SubVec;
+ };
+ SDValue Vec = DAG.getUNDEF(ContainerVT);
+ for (auto [I, Data] : enumerate(Operands)) {
+ if (Data.empty())
+ continue;
+ SmallDenseMap<unsigned, SDValue, 4> Values;
+ for (unsigned I : seq<unsigned>(Data.size())) {
+ const auto &[Idx1, Idx2, _] = Data[I];
+ if (Values.contains(Idx1)) {
+ assert(Idx2 != UINT_MAX && Values.contains(Idx2) &&
+ "Expected both indices to be extracted already.");
+ break;
+ }
+ SDValue V = ExtractValue(Idx1 >= NumOfSrcRegs ? V2 : V1,
+ (Idx1 % NumOfSrcRegs) * NumOpElts);
+ Values[Idx1] = V;
+ if (Idx2 != UINT_MAX)
+ Values[Idx2] = ExtractValue(Idx2 >= NumOfSrcRegs ? V2 : V1,
+ (Idx2 % NumOfSrcRegs) * NumOpElts);
+ }
+ SDValue V;
+ for (const auto &[Idx1, Idx2, Mask] : Data) {
+ SDValue V1 = Values.at(Idx1);
+ SDValue V2 = Idx2 == UINT_MAX ? V1 : Values.at(Idx2);
+ V = PerformShuffle(V1, V2, Mask);
+ Values[Idx1] = V;
----------------
preames wrote:
It seems weird to overwrite the prior value for the source here. Is this what the API requires, or is this supposed to be a separate destination namespace?
https://github.com/llvm/llvm-project/pull/121765
More information about the llvm-commits mailing list