[llvm] [X86][Codegen] Shuffle certain shifts on i8 vectors to create opportunity for vectorized shift instructions (PR #117980)

Thu Nov 28 00:56:35 PST 2024

https://github.com/huangjd created https://github.com/llvm/llvm-project/pull/117980

Vectorized shift instructions are not available for the i8 element type. The current typical way to handle a shift on an i8 vector is to use two vector i16 multiplies to compute the even and odd bytes separately and then combine them. If the shift amount is a constant vector, and we can shuffle that constant vector so that each pair or quad of adjacent elements has the same value, we can instead obtain the result with a vector shift on the widened type followed by a vector AND that clears the bits that are supposed to be shifted out of each byte. This is typically faster than using vector multiplies, as long as the shuffles themselves are also fast, because the operand has to be shuffled before the shift and the result shuffled back to its original order afterwards.

>From a398aae3289b13f7ed3c5e610b16c595febc0ca9 Mon Sep 17 00:00:00 2001
From: William Huang <williamjhuang at google.com>
Date: Wed, 20 Nov 2024 22:51:53 -0500
Subject: [PATCH 1/3] initial commit - for vxi8 shifts, try permute vector to
 widen shift

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 200 ++++++++++++++++++++++++
 1 file changed, 200 insertions(+)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1c790f3813b7a4..9bfa8cd6610cd5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29766,6 +29766,102 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
   return SDValue();
 }
 
+// Given a vector of values, find a permutation such that every adjacent even-
+// odd pair has the same value. ~0 is reserved as a special value for wildcard,
+// which can be paired with any value. Returns true if a permutation is found.
+template <typename InputTy,
+         typename PermutationTy,
+         typename MapTy = std::unordered_map<typename InputTy::value_type,
+                                         std::pair<typename InputTy::value_type, typename PermutationTy::value_type>>>
+static bool PermuteAndPairVector(const InputTy& Inputs,
+                                 PermutationTy &Permutation) {
+  const auto Wildcard = ~typename InputTy::value_type();
+
+  // List of values to be paired, mapping an unpaired value to its current
+  // neighbor's value and index.
+  MapTy UnpairedInputs;
+  SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
+
+  Permutation.clear();
+  typename PermutationTy::value_type I = 0;
+  for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end(); InputIt != InputEnd;) {
+    Permutation.push_back(I);
+    Permutation.push_back(I + 1);
+
+    auto Even = *InputIt++;
+    assert(InputIt != InputEnd && "Expected even number of elements");
+    auto Odd = *InputIt++;
+
+    // If both are wildcards, note it for later use by unpairable values.
+    if (Even == Wildcard && Odd == Wildcard) {
+      WildcardPairs.push_back(I);
+    }
+
+    // If both are equal, they are in good position.
+    if (Even != Odd) {
+      auto DoWork = [&] (auto &This, auto ThisIndex, auto Other, auto OtherIndex) {
+        if (This != Wildcard) {
+          // For non-wildcard value, check if it can pair with an exisiting
+          // unpaired value from UnpairedInputs, if so, swap with the unpaired
+          // value's neighbor, otherwise the current value is added to the map.
+          if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(This, std::make_pair(Other, OtherIndex)); !Inserted) {
+            auto [SwapValue, SwapIndex] = MapIt->second;
+            std::swap(Permutation[SwapIndex], Permutation[ThisIndex]);
+            This = SwapValue;
+            UnpairedInputs.erase(MapIt);
+
+            if (This == Other) {
+              if (This == Wildcard) {
+                // We freed up a wildcard pair by pairing two non-adjacent
+                // values, note it for later use by unpairable values.
+                WildcardPairs.push_back(I);
+              } else {
+                // The swapped element also forms a pair with Other, so it can
+                // be removed from the map.
+                assert(UnpairedInputs.count(This));
+                UnpairedInputs.erase(This);
+              }
+            } else {
+              // Swapped in an unpaired value, update its info.
+              if (This != Wildcard) {
+                assert(UnpairedInputs.count(This));
+                UnpairedInputs[This] = std::make_pair(Other, OtherIndex);
+              }
+              // If its neighbor is also in UnpairedInputs, update its info too.
+              if (auto OtherMapIt = UnpairedInputs.find(Other); OtherMapIt != UnpairedInputs.end() && OtherMapIt->second.second == ThisIndex) {
+                OtherMapIt->second.first = This;
+              }
+            }
+          }
+        }
+      };
+      DoWork(Even, I, Odd, I + 1);
+      if (Even != Odd) {
+        DoWork(Odd, I + 1, Even, I);
+      }
+    }
+    I += 2;
+  }
+
+  // Now check if each remaining unpaired neighboring values can be swapped with
+  // a wildcard pair to form two paired values.
+  for (auto &[Unpaired, V] : UnpairedInputs) {
+    auto [Neighbor, NeighborIndex]  = V;
+    if (Neighbor != Wildcard) {
+      assert(UnpairedInputs.count(Neighbor));
+      if (WildcardPairs.size()) {
+        std::swap(Permutation[WildcardPairs.back()], Permutation[NeighborIndex]);
+        WildcardPairs.pop_back();
+        // Mark the neighbor as processed.
+        UnpairedInputs[Neighbor].first = Wildcard;
+      } else {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
@@ -30044,6 +30140,110 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     }
   }
 
+  // ISD::SRA/SRL/SHL on vXi8 can be widened to vYi16 (Y = X/2) if the constant
+  // amounts can be shuffled such that every pair of adjacent elements has the
+  // same value. This introduces an extra shuffle before and after the shift,
+  // and it is profitable if the operand is aready a shuffle so that both can
+  // be merged, or if the extra shuffle is fast (can use VPSHUFB).
+  // (shift (shuffle X P1) S1) ->
+  // (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
+  // widened, and P2^-1 is the inverse shuffle of P2.
+  if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse() && Subtarget.hasSSE3()) {
+    bool Profitable = true;
+    // VPAND ymm only available on AVX2.
+    if (VT == MVT::v32i8 || VT == MVT::v64i8) {
+      Profitable = Subtarget.hasAVX2();
+    }
+
+    SmallVector<int, 64> Permutation;
+    SmallVector<uint16_t, 64> ShiftAmt;
+    for (size_t I = 0; I < Amt.getNumOperands(); ++I) {
+      if (Amt.getOperand(I).isUndef())
+        ShiftAmt.push_back(~0);
+      else
+        ShiftAmt.push_back(Amt.getConstantOperandVal(I));
+    }
+
+    if (Profitable && (VT == MVT::v32i8 || VT == MVT::v64i8)) {
+      Profitable = false;
+      constexpr size_t LaneBytes = 16;
+      const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
+
+      // For v32i8 or v64i8, we should check if we can generate a shuffle that
+      // may be lowered to VPSHUFB, because it is faster than VPERMB. This is
+      // possible if we can apply the same shuffle mask to each v16i8 lane.
+      // For example (assuming a lane has 4 elements for simplicity),
+      // <1, 2, 2, 1, 4, 3, 3, 4> is handled as <14, 23, 23, 14>, which can
+      // be shuffled to adjacent pairs <14, 14, 23, 23> with the VPSHUFB mask
+      // <0, 3, 2, 1> (or high level mask <0, 3, 2, 1, 4, 7, 6, 5>).
+      // Limitation: if there are some undef in shift amounts, this algorithm
+      // may not find a solution even if one exists, as here we only treat a
+      // VPSHUFB index as undef if all shuffle amounts of the same index modulo
+      // lane size are all undef.
+      // Since a byte can only be shifted by 7 bits without being UB, 4 bits are
+      // enough to represent the shift amount or undef (0xF).
+      std::array<uint16_t, LaneBytes> VPSHUFBShiftAmt = {};
+      for (size_t I = 0; I < LaneBytes; ++I)
+        for (size_t J = 0; J < NumLanes; ++J)
+          VPSHUFBShiftAmt[I] |= (ShiftAmt[I + J * LaneBytes] & 0xF) << (J * 4);
+      if (VT == MVT::v32i8) {
+        for (size_t I = 0; I < LaneBytes; ++I)
+          VPSHUFBShiftAmt[I] |= 0xFF00;
+      }
+      if (PermuteAndPairVector(VPSHUFBShiftAmt, Permutation)) {
+        // Found a VPSHUFB solution, offset the shuffle amount to other lanes.
+        Permutation.resize(VT.getVectorNumElements());
+        for (size_t I = 0; I < LaneBytes; ++I)
+          for (size_t J = 1; J < NumLanes; ++J)
+            Permutation[I + J * LaneBytes] = Permutation[I] + J * LaneBytes;
+        Profitable = true;
+      } else if (R.getOpcode() == ISD::VECTOR_SHUFFLE) {
+        // A slower shuffle is profitable if the operand is also a slow shuffle,
+        // such that they can be merged.
+        // TODO: Use TargetTransformInfo to systematically determine whether
+        // inner shuffle is slow. Currently we only check if it contains
+        // cross-lane shuffle.
+        if (ShuffleVectorSDNode *InnerShuffle = dyn_cast<ShuffleVectorSDNode>(R.getNode())) {
+          if (InnerShuffle->getMask().size() == VT.getVectorNumElements() &&
+              is128BitLaneCrossingShuffleMask(VT, InnerShuffle->getMask()))
+            Profitable = true;
+        }
+      }
+    }
+
+    // If it is still profitable at this point, and has not found a permutation
+    // yet, try again with any shuffle index.
+    if (Profitable && Permutation.empty()) {
+      PermuteAndPairVector<decltype(ShiftAmt), decltype(Permutation),
+                           SmallMapVector<uint16_t, std::pair<uint16_t, int>, 8>>(ShiftAmt, Permutation);
+    }
+
+    // Found a permutation P that can rearrange the shift amouts into adjacent
+    // pair of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
+    if (!Permutation.empty()) {
+      SDValue InnerShuffle = DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
+      SmallVector<SDValue, 64> NewShiftAmt;
+      for (int Index : Permutation) {
+        NewShiftAmt.push_back(Amt.getOperand(Index));
+      }
+#ifndef NDEBUG
+      for (size_t I = 0; I < NewShiftAmt.size(); I += 2) {
+        SDValue Even = NewShiftAmt[I];
+        SDValue Odd = NewShiftAmt[I + 1];
+        assert(Even.isUndef() || Odd.isUndef() || Even->getAsZExtVal() == Odd->getAsZExtVal());
+      }
+#endif
+      SDValue NewShiftVector = DAG.getBuildVector(VT, dl, NewShiftAmt);
+      SDValue NewShift = DAG.getNode(Opc, dl, VT, InnerShuffle, NewShiftVector);
+      SmallVector<int, 64> InversePermutation(Permutation.size());
+      for (size_t I = 0; I < Permutation.size(); ++I) {
+        InversePermutation[Permutation[I]] = I;
+      }
+      SDValue OuterShuffle = DAG.getVectorShuffle(VT, dl, NewShift, DAG.getUNDEF(VT), InversePermutation);
+      return OuterShuffle;
+    }
+  }
+
   // If possible, lower this packed shift into a vector multiply instead of
   // expanding it into a sequence of scalar shifts.
   // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.

>From 0a0f4805c0bd86186914001e1ec8419fe58945b2 Mon Sep 17 00:00:00 2001
From: William Huang <williamjhuang at google.com>
Date: Thu, 28 Nov 2024 01:47:21 -0500
Subject: [PATCH 2/3] Second version: more cpu latency measurement with
 llvm-mca

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 174 +++++++++++++-----------
 1 file changed, 98 insertions(+), 76 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9bfa8cd6610cd5..aba3968c29eb9b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29769,24 +29769,27 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
 // Given a vector of values, find a permutation such that every adjacent even-
 // odd pair has the same value. ~0 is reserved as a special value for wildcard,
 // which can be paired with any value. Returns true if a permutation is found.
+// If output Permutation is not empty, permutation index starts at its previous
+// size, so that this function can concatenate the result of multiple calls.
+// UnpairedInputs contains values yet to be paired, mapping an unpaired value to
+// its current neighbor's value and index.
+// Do not use llvm::DenseMap as ~0 is reserved key.
 template <typename InputTy,
          typename PermutationTy,
-         typename MapTy = std::unordered_map<typename InputTy::value_type,
-                                         std::pair<typename InputTy::value_type, typename PermutationTy::value_type>>>
+         typename MapTy = SmallMapVector<typename InputTy::value_type,
+                                         std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8>>
 static bool PermuteAndPairVector(const InputTy& Inputs,
-                                 PermutationTy &Permutation) {
+                                 PermutationTy &Permutation,
+                                 MapTy UnpairedInputs = SmallMapVector<typename InputTy::value_type,
+                                                                       std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8>()) {
   const auto Wildcard = ~typename InputTy::value_type();
-
-  // List of values to be paired, mapping an unpaired value to its current
-  // neighbor's value and index.
-  MapTy UnpairedInputs;
   SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
 
-  Permutation.clear();
+  size_t OutputOffset = Permutation.size();
   typename PermutationTy::value_type I = 0;
   for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end(); InputIt != InputEnd;) {
-    Permutation.push_back(I);
-    Permutation.push_back(I + 1);
+    Permutation.push_back(OutputOffset + I);
+    Permutation.push_back(OutputOffset + I + 1);
 
     auto Even = *InputIt++;
     assert(InputIt != InputEnd && "Expected even number of elements");
@@ -29806,7 +29809,7 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
           // value's neighbor, otherwise the current value is added to the map.
           if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(This, std::make_pair(Other, OtherIndex)); !Inserted) {
             auto [SwapValue, SwapIndex] = MapIt->second;
-            std::swap(Permutation[SwapIndex], Permutation[ThisIndex]);
+            std::swap(Permutation[OutputOffset + SwapIndex], Permutation[OutputOffset + ThisIndex]);
             This = SwapValue;
             UnpairedInputs.erase(MapIt);
 
@@ -29850,13 +29853,12 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
     if (Neighbor != Wildcard) {
       assert(UnpairedInputs.count(Neighbor));
       if (WildcardPairs.size()) {
-        std::swap(Permutation[WildcardPairs.back()], Permutation[NeighborIndex]);
+        std::swap(Permutation[OutputOffset + WildcardPairs.back()], Permutation[OutputOffset + NeighborIndex]);
         WildcardPairs.pop_back();
         // Mark the neighbor as processed.
         UnpairedInputs[Neighbor].first = Wildcard;
-      } else {
+      } else
         return false;
-      }
     }
   }
   return true;
@@ -30140,23 +30142,22 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     }
   }
 
-  // ISD::SRA/SRL/SHL on vXi8 can be widened to vYi16 (Y = X/2) if the constant
-  // amounts can be shuffled such that every pair of adjacent elements has the
-  // same value. This introduces an extra shuffle before and after the shift,
-  // and it is profitable if the operand is aready a shuffle so that both can
-  // be merged, or if the extra shuffle is fast (can use VPSHUFB).
+  // SHL/SRL/SRA on vXi8 can be widened to vYi16 or vYi32 if the constant
+  // amounts can be shuffled such that every pair or quad of adjacent elements
+  // has the same value. This introduces an extra shuffle before and after the
+  // shift, and it is profitable if the operand is already a shuffle so that
+  // both can be merged and the extra shuffle is fast. This is not profitable
+  // on AVX512 because it has 16-bit vector variable shift instruction VPS**VW.
   // (shift (shuffle X P1) S1) ->
   // (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
   // widened, and P2^-1 is the inverse shuffle of P2.
-  if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse() && Subtarget.hasSSE3()) {
-    bool Profitable = true;
-    // VPAND ymm only available on AVX2.
-    if (VT == MVT::v32i8 || VT == MVT::v64i8) {
-      Profitable = Subtarget.hasAVX2();
-    }
+  if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse()
+      && Subtarget.hasSSE3() && !Subtarget.hasAVX512()) {
+    constexpr size_t LaneBytes = 16;
+    const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
 
     SmallVector<int, 64> Permutation;
-    SmallVector<uint16_t, 64> ShiftAmt;
+    SmallVector<uint8_t, 64> ShiftAmt;
     for (size_t I = 0; I < Amt.getNumOperands(); ++I) {
       if (Amt.getOperand(I).isUndef())
         ShiftAmt.push_back(~0);
@@ -30164,63 +30165,84 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
         ShiftAmt.push_back(Amt.getConstantOperandVal(I));
     }
 
-    if (Profitable && (VT == MVT::v32i8 || VT == MVT::v64i8)) {
-      Profitable = false;
-      constexpr size_t LaneBytes = 16;
-      const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
-
-      // For v32i8 or v64i8, we should check if we can generate a shuffle that
-      // may be lowered to VPSHUFB, because it is faster than VPERMB. This is
-      // possible if we can apply the same shuffle mask to each v16i8 lane.
-      // For example (assuming a lane has 4 elements for simplicity),
-      // <1, 2, 2, 1, 4, 3, 3, 4> is handled as <14, 23, 23, 14>, which can
-      // be shuffled to adjacent pairs <14, 14, 23, 23> with the VPSHUFB mask
-      // <0, 3, 2, 1> (or high level mask <0, 3, 2, 1, 4, 7, 6, 5>).
-      // Limitation: if there are some undef in shift amounts, this algorithm
-      // may not find a solution even if one exists, as here we only treat a
-      // VPSHUFB index as undef if all shuffle amounts of the same index modulo
-      // lane size are all undef.
-      // Since a byte can only be shifted by 7 bits without being UB, 4 bits are
-      // enough to represent the shift amount or undef (0xF).
-      std::array<uint16_t, LaneBytes> VPSHUFBShiftAmt = {};
-      for (size_t I = 0; I < LaneBytes; ++I)
-        for (size_t J = 0; J < NumLanes; ++J)
-          VPSHUFBShiftAmt[I] |= (ShiftAmt[I + J * LaneBytes] & 0xF) << (J * 4);
-      if (VT == MVT::v32i8) {
-        for (size_t I = 0; I < LaneBytes; ++I)
-          VPSHUFBShiftAmt[I] |= 0xFF00;
-      }
-      if (PermuteAndPairVector(VPSHUFBShiftAmt, Permutation)) {
-        // Found a VPSHUFB solution, offset the shuffle amount to other lanes.
-        Permutation.resize(VT.getVectorNumElements());
-        for (size_t I = 0; I < LaneBytes; ++I)
-          for (size_t J = 1; J < NumLanes; ++J)
-            Permutation[I + J * LaneBytes] = Permutation[I] + J * LaneBytes;
-        Profitable = true;
-      } else if (R.getOpcode() == ISD::VECTOR_SHUFFLE) {
-        // A slower shuffle is profitable if the operand is also a slow shuffle,
-        // such that they can be merged.
-        // TODO: Use TargetTransformInfo to systematically determine whether
-        // inner shuffle is slow. Currently we only check if it contains
-        // cross-lane shuffle.
-        if (ShuffleVectorSDNode *InnerShuffle = dyn_cast<ShuffleVectorSDNode>(R.getNode())) {
-          if (InnerShuffle->getMask().size() == VT.getVectorNumElements() &&
-              is128BitLaneCrossingShuffleMask(VT, InnerShuffle->getMask()))
-            Profitable = true;
+    // Check if we can find an in-lane shuffle to rearrange the shift amounts,
+    // if so, this transformation may be profitable.
+    bool Profitable;
+    for (size_t I = 0; I < NumLanes; ++I) {
+      if (!(Profitable = PermuteAndPairVector(ArrayRef(&ShiftAmt[I * LaneBytes], LaneBytes), Permutation)))
+        break;
+    }
+
+    // For AVX2, check if we can further rearrange shift amounts into adjacent
+    // quads, so that it can use VPS*LVD instead of VPMUL*W as it is 2 cycles
+    // faster.
+    bool IsAdjacentQuads = false;
+    if (Profitable && Subtarget.hasAVX2()) {
+      SmallVector<uint8_t, 64> EveryOtherShiftAmt;
+      for (size_t I = 0; I < Permutation.size(); I += 2) {
+        uint8_t Shift1 = ShiftAmt[Permutation[I]];
+        uint8_t Shift2 = ShiftAmt[Permutation[I + 1]];
+        assert(Shift1 == Shift2 || ~Shift1 == 0 || ~Shift2 == 0);
+        EveryOtherShiftAmt.push_back(~Shift1 ? Shift1 : Shift2);
+      }
+      SmallVector<int, 32> Permutation2;
+      for (size_t I = 0; I < NumLanes; ++I) {
+        if (!(IsAdjacentQuads = PermuteAndPairVector(ArrayRef(&EveryOtherShiftAmt[I * LaneBytes / 2], LaneBytes / 2), Permutation2)))
+          break;
+      }
+      if (IsAdjacentQuads) {
+        SmallVector<int, 64> CombinedPermutation;
+        for (int Index : Permutation2) {
+          CombinedPermutation.push_back(Permutation[Index * 2]);
+          CombinedPermutation.push_back(Permutation[Index * 2 + 1]);
         }
+        std::swap(Permutation, CombinedPermutation);
       }
     }
 
-    // If it is still profitable at this point, and has not found a permutation
-    // yet, try again with any shuffle index.
-    if (Profitable && Permutation.empty()) {
-      PermuteAndPairVector<decltype(ShiftAmt), decltype(Permutation),
-                           SmallMapVector<uint16_t, std::pair<uint16_t, int>, 8>>(ShiftAmt, Permutation);
+    // For right shifts, (V)PMULHUW needs an extra instruction to handle an
+    // amount of 0, disabling the transformation here to be cautious.
+    if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
+        any_of(ShiftAmt, [](auto x) { return x == 0; }))
+      Profitable = false;
+
+    bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
+    // If operand R is not a shuffle by itself, the transformation here adds two
+    // shuffles, adding a non-trivial cost. Here we take out a few cases where
+    // the benefit is questionable according to llvm-mca's modeling.
+    //
+    // Each cell shows latency before/after transform. Here R is not a shuffle.
+    // SSE3
+    //      | v16i8 | v32i8 | v64i8
+    // ----------------------------
+    // SLL  | 17/17 | 20/20 | 26/26
+    // SRL  | 18/17 | 22/20 | 35/26
+    // SRA  | 21/19 | 26/22 | 39/30
+    // AVX2 using VPMUL*W
+    //      | v16i8 | v32i8 | v64i8
+    // ----------------------------
+    // SLL  | 20/18 | 18/18 | 21/21
+    // SRL  | 20/18 | 22/18 | 26/21
+    // SRA  | 20/20 | 22/20 | 25/23
+    // AVX2 using VPS*LVD
+    //      | v16i8 | v32i8 | v64i8
+    // ----------------------------
+    // SLL  | 20/16 | 18/16 | 21/20
+    // SRL  | 20/16 | 22/16 | 26/20
+    // SRA  | 20/18 | 22/18 | 25/22
+    if (!IsOperandShuffle) {
+      if (Subtarget.hasAVX2()) {
+        if (!IsAdjacentQuads || (VT == MVT::v64i8 && Opc == ISD::SHL))
+          Profitable = false;
+      } else {
+        if (Opc == ISD::SHL || ((VT == MVT::v16i8 || VT == MVT::v32i8) && Opc == ISD::SRL))
+          Profitable = false;
+      }
     }
 
     // Found a permutation P that can rearrange the shift amouts into adjacent
-    // pair of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
-    if (!Permutation.empty()) {
+    // pair or quad of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
+    if (Profitable) {
       SDValue InnerShuffle = DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
       SmallVector<SDValue, 64> NewShiftAmt;
       for (int Index : Permutation) {

>From 02249f3c811568e31e78b9290bb2189a089bc5ae Mon Sep 17 00:00:00 2001
From: William Huang <williamjhuang at google.com>
Date: Thu, 28 Nov 2024 03:39:13 -0500
Subject: [PATCH 3/3] format

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 55 ++++++++++++++++---------
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aba3968c29eb9b..90d7be73c62126 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29774,20 +29774,22 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
 // UnpairedInputs contains values yet to be paired, mapping an unpaired value to
 // its current neighbor's value and index.
 // Do not use llvm::DenseMap as ~0 is reserved key.
-template <typename InputTy,
-         typename PermutationTy,
-         typename MapTy = SmallMapVector<typename InputTy::value_type,
-                                         std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8>>
-static bool PermuteAndPairVector(const InputTy& Inputs,
-                                 PermutationTy &Permutation,
-                                 MapTy UnpairedInputs = SmallMapVector<typename InputTy::value_type,
-                                                                       std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8>()) {
+template <typename InputTy, typename PermutationTy,
+          typename MapTy =
+              SmallMapVector<typename InputTy::value_type,
+                             std::pair<typename InputTy::value_type,
+                                       typename PermutationTy::value_type>,
+                             8>>
+static bool PermuteAndPairVector(
+    const InputTy &Inputs, PermutationTy &Permutation,
+    MapTy UnpairedInputs = MapTy()) {
   const auto Wildcard = ~typename InputTy::value_type();
   SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
 
   size_t OutputOffset = Permutation.size();
   typename PermutationTy::value_type I = 0;
-  for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end(); InputIt != InputEnd;) {
+  for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end();
+       InputIt != InputEnd;) {
     Permutation.push_back(OutputOffset + I);
     Permutation.push_back(OutputOffset + I + 1);
 
@@ -29802,14 +29804,18 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
 
     // If both are equal, they are in good position.
     if (Even != Odd) {
-      auto DoWork = [&] (auto &This, auto ThisIndex, auto Other, auto OtherIndex) {
+      auto DoWork = [&](auto &This, auto ThisIndex, auto Other,
+                        auto OtherIndex) {
         if (This != Wildcard) {
           // For non-wildcard value, check if it can pair with an exisiting
           // unpaired value from UnpairedInputs, if so, swap with the unpaired
           // value's neighbor, otherwise the current value is added to the map.
-          if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(This, std::make_pair(Other, OtherIndex)); !Inserted) {
+          if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(
+                  This, std::make_pair(Other, OtherIndex));
+              !Inserted) {
             auto [SwapValue, SwapIndex] = MapIt->second;
-            std::swap(Permutation[OutputOffset + SwapIndex], Permutation[OutputOffset + ThisIndex]);
+            std::swap(Permutation[OutputOffset + SwapIndex],
+                      Permutation[OutputOffset + ThisIndex]);
             This = SwapValue;
             UnpairedInputs.erase(MapIt);
 
@@ -29831,7 +29837,9 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
                 UnpairedInputs[This] = std::make_pair(Other, OtherIndex);
               }
               // If its neighbor is also in UnpairedInputs, update its info too.
-              if (auto OtherMapIt = UnpairedInputs.find(Other); OtherMapIt != UnpairedInputs.end() && OtherMapIt->second.second == ThisIndex) {
+              if (auto OtherMapIt = UnpairedInputs.find(Other);
+                  OtherMapIt != UnpairedInputs.end() &&
+                  OtherMapIt->second.second == ThisIndex) {
                 OtherMapIt->second.first = This;
               }
             }
@@ -29849,11 +29857,12 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
   // Now check if each remaining unpaired neighboring values can be swapped with
   // a wildcard pair to form two paired values.
   for (auto &[Unpaired, V] : UnpairedInputs) {
-    auto [Neighbor, NeighborIndex]  = V;
+    auto [Neighbor, NeighborIndex] = V;
     if (Neighbor != Wildcard) {
       assert(UnpairedInputs.count(Neighbor));
       if (WildcardPairs.size()) {
-        std::swap(Permutation[OutputOffset + WildcardPairs.back()], Permutation[OutputOffset + NeighborIndex]);
+        std::swap(Permutation[OutputOffset + WildcardPairs.back()],
+                  Permutation[OutputOffset + NeighborIndex]);
         WildcardPairs.pop_back();
         // Mark the neighbor as processed.
         UnpairedInputs[Neighbor].first = Wildcard;
@@ -30151,8 +30160,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
   // (shift (shuffle X P1) S1) ->
   // (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
   // widened, and P2^-1 is the inverse shuffle of P2.
-  if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse()
-      && Subtarget.hasSSE3() && !Subtarget.hasAVX512()) {
+  if (ConstantAmt &&
+      (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) &&
+      R.hasOneUse() && Subtarget.hasSSE3() && !Subtarget.hasAVX512()) {
     constexpr size_t LaneBytes = 16;
     const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
 
@@ -30169,7 +30179,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     // if so, this transformation may be profitable.
     bool Profitable;
     for (size_t I = 0; I < NumLanes; ++I) {
-      if (!(Profitable = PermuteAndPairVector(ArrayRef(&ShiftAmt[I * LaneBytes], LaneBytes), Permutation)))
+      if (!(Profitable = PermuteAndPairVector(
+                ArrayRef(&ShiftAmt[I * LaneBytes], LaneBytes), Permutation)))
         break;
     }
 
@@ -30187,7 +30198,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       }
       SmallVector<int, 32> Permutation2;
       for (size_t I = 0; I < NumLanes; ++I) {
-        if (!(IsAdjacentQuads = PermuteAndPairVector(ArrayRef(&EveryOtherShiftAmt[I * LaneBytes / 2], LaneBytes / 2), Permutation2)))
+        if (!(IsAdjacentQuads = PermuteAndPairVector(
+                  ArrayRef(&EveryOtherShiftAmt[I * LaneBytes / 2],
+                           LaneBytes / 2),
+                  Permutation2)))
           break;
       }
       if (IsAdjacentQuads) {
@@ -30235,7 +30249,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
         if (!IsAdjacentQuads || (VT == MVT::v64i8 && Opc == ISD::SHL))
           Profitable = false;
       } else {
-        if (Opc == ISD::SHL || ((VT == MVT::v16i8 || VT == MVT::v32i8) && Opc == ISD::SRL))
+        if (Opc == ISD::SHL ||
+            ((VT == MVT::v16i8 || VT == MVT::v32i8) && Opc == ISD::SRL))
           Profitable = false;
       }
     }