[llvm] [X86][Codegen] Shuffle certain shifts on i8 vectors to create opportunity for vectorized shift instructions (PR #117980)

William Huang via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 19 21:06:55 PST 2024


https://github.com/huangjd updated https://github.com/llvm/llvm-project/pull/117980

>From a398aae3289b13f7ed3c5e610b16c595febc0ca9 Mon Sep 17 00:00:00 2001
From: William Huang <williamjhuang at google.com>
Date: Wed, 20 Nov 2024 22:51:53 -0500
Subject: [PATCH 1/8] initial commit - for vxi8 shifts, try permute vector to
 widen shift

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 200 ++++++++++++++++++++++++
 1 file changed, 200 insertions(+)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1c790f3813b7a4..9bfa8cd6610cd5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29766,6 +29766,102 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
   return SDValue();
 }
 
+// Given a vector of values, find a permutation such that every adjacent even-
+// odd pair has the same value. ~0 is reserved as a special value for wildcard,
+// which can be paired with any value. Returns true if a permutation is found.
+template <typename InputTy,
+         typename PermutationTy,
+         typename MapTy = std::unordered_map<typename InputTy::value_type,
+                                         std::pair<typename InputTy::value_type, typename PermutationTy::value_type>>>
+static bool PermuteAndPairVector(const InputTy& Inputs,
+                                 PermutationTy &Permutation) {
+  const auto Wildcard = ~typename InputTy::value_type();
+
+  // List of values to be paired, mapping an unpaired value to its current
+  // neighbor's value and index.
+  MapTy UnpairedInputs;
+  SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
+
+  Permutation.clear();
+  typename PermutationTy::value_type I = 0;
+  for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end(); InputIt != InputEnd;) {
+    Permutation.push_back(I);
+    Permutation.push_back(I + 1);
+
+    auto Even = *InputIt++;
+    assert(InputIt != InputEnd && "Expected even number of elements");
+    auto Odd = *InputIt++;
+
+    // If both are wildcards, note it for later use by unpairable values.
+    if (Even == Wildcard && Odd == Wildcard) {
+      WildcardPairs.push_back(I);
+    }
+
+    // If both are equal, they are in good position.
+    if (Even != Odd) {
+      auto DoWork = [&] (auto &This, auto ThisIndex, auto Other, auto OtherIndex) {
+        if (This != Wildcard) {
+          // For non-wildcard value, check if it can pair with an exisiting
+          // unpaired value from UnpairedInputs, if so, swap with the unpaired
+          // value's neighbor, otherwise the current value is added to the map.
+          if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(This, std::make_pair(Other, OtherIndex)); !Inserted) {
+            auto [SwapValue, SwapIndex] = MapIt->second;
+            std::swap(Permutation[SwapIndex], Permutation[ThisIndex]);
+            This = SwapValue;
+            UnpairedInputs.erase(MapIt);
+
+            if (This == Other) {
+              if (This == Wildcard) {
+                // We freed up a wildcard pair by pairing two non-adjacent
+                // values, note it for later use by unpairable values.
+                WildcardPairs.push_back(I);
+              } else {
+                // The swapped element also forms a pair with Other, so it can
+                // be removed from the map.
+                assert(UnpairedInputs.count(This));
+                UnpairedInputs.erase(This);
+              }
+            } else {
+              // Swapped in an unpaired value, update its info.
+              if (This != Wildcard) {
+                assert(UnpairedInputs.count(This));
+                UnpairedInputs[This] = std::make_pair(Other, OtherIndex);
+              }
+              // If its neighbor is also in UnpairedInputs, update its info too.
+              if (auto OtherMapIt = UnpairedInputs.find(Other); OtherMapIt != UnpairedInputs.end() && OtherMapIt->second.second == ThisIndex) {
+                OtherMapIt->second.first = This;
+              }
+            }
+          }
+        }
+      };
+      DoWork(Even, I, Odd, I + 1);
+      if (Even != Odd) {
+        DoWork(Odd, I + 1, Even, I);
+      }
+    }
+    I += 2;
+  }
+
+  // Now check if each remaining unpaired neighboring values can be swapped with
+  // a wildcard pair to form two paired values.
+  for (auto &[Unpaired, V] : UnpairedInputs) {
+    auto [Neighbor, NeighborIndex]  = V;
+    if (Neighbor != Wildcard) {
+      assert(UnpairedInputs.count(Neighbor));
+      if (WildcardPairs.size()) {
+        std::swap(Permutation[WildcardPairs.back()], Permutation[NeighborIndex]);
+        WildcardPairs.pop_back();
+        // Mark the neighbor as processed.
+        UnpairedInputs[Neighbor].first = Wildcard;
+      } else {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
@@ -30044,6 +30140,110 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     }
   }
 
+  // ISD::SRA/SRL/SHL on vXi8 can be widened to vYi16 (Y = X/2) if the constant
+  // amounts can be shuffled such that every pair of adjacent elements has the
+  // same value. This introduces an extra shuffle before and after the shift,
+  // and it is profitable if the operand is aready a shuffle so that both can
+  // be merged, or if the extra shuffle is fast (can use VPSHUFB).
+  // (shift (shuffle X P1) S1) ->
+  // (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
+  // widened, and P2^-1 is the inverse shuffle of P2.
+  if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse() && Subtarget.hasSSE3()) {
+    bool Profitable = true;
+    // VPAND ymm only available on AVX2.
+    if (VT == MVT::v32i8 || VT == MVT::v64i8) {
+      Profitable = Subtarget.hasAVX2();
+    }
+
+    SmallVector<int, 64> Permutation;
+    SmallVector<uint16_t, 64> ShiftAmt;
+    for (size_t I = 0; I < Amt.getNumOperands(); ++I) {
+      if (Amt.getOperand(I).isUndef())
+        ShiftAmt.push_back(~0);
+      else
+        ShiftAmt.push_back(Amt.getConstantOperandVal(I));
+    }
+
+    if (Profitable && (VT == MVT::v32i8 || VT == MVT::v64i8)) {
+      Profitable = false;
+      constexpr size_t LaneBytes = 16;
+      const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
+
+      // For v32i8 or v64i8, we should check if we can generate a shuffle that
+      // may be lowered to VPSHUFB, because it is faster than VPERMB. This is
+      // possible if we can apply the same shuffle mask to each v16i8 lane.
+      // For example (assuming a lane has 4 elements for simplicity),
+      // <1, 2, 2, 1, 4, 3, 3, 4> is handled as <14, 23, 23, 14>, which can
+      // be shuffled to adjacent pairs <14, 14, 23, 23> with the VPSHUFB mask
+      // <0, 3, 2, 1> (or high level mask <0, 3, 2, 1, 4, 7, 6, 5>).
+      // Limitation: if there are some undef in shift amounts, this algorithm
+      // may not find a solution even if one exists, as here we only treat a
+      // VPSHUFB index as undef if all shuffle amounts of the same index modulo
+      // lane size are all undef.
+      // Since a byte can only be shifted by 7 bits without being UB, 4 bits are
+      // enough to represent the shift amount or undef (0xF).
+      std::array<uint16_t, LaneBytes> VPSHUFBShiftAmt = {};
+      for (size_t I = 0; I < LaneBytes; ++I)
+        for (size_t J = 0; J < NumLanes; ++J)
+          VPSHUFBShiftAmt[I] |= (ShiftAmt[I + J * LaneBytes] & 0xF) << (J * 4);
+      if (VT == MVT::v32i8) {
+        for (size_t I = 0; I < LaneBytes; ++I)
+          VPSHUFBShiftAmt[I] |= 0xFF00;
+      }
+      if (PermuteAndPairVector(VPSHUFBShiftAmt, Permutation)) {
+        // Found a VPSHUFB solution, offset the shuffle amount to other lanes.
+        Permutation.resize(VT.getVectorNumElements());
+        for (size_t I = 0; I < LaneBytes; ++I)
+          for (size_t J = 1; J < NumLanes; ++J)
+            Permutation[I + J * LaneBytes] = Permutation[I] + J * LaneBytes;
+        Profitable = true;
+      } else if (R.getOpcode() == ISD::VECTOR_SHUFFLE) {
+        // A slower shuffle is profitable if the operand is also a slow shuffle,
+        // such that they can be merged.
+        // TODO: Use TargetTransformInfo to systematically determine whether
+        // inner shuffle is slow. Currently we only check if it contains
+        // cross-lane shuffle.
+        if (ShuffleVectorSDNode *InnerShuffle = dyn_cast<ShuffleVectorSDNode>(R.getNode())) {
+          if (InnerShuffle->getMask().size() == VT.getVectorNumElements() &&
+              is128BitLaneCrossingShuffleMask(VT, InnerShuffle->getMask()))
+            Profitable = true;
+        }
+      }
+    }
+
+    // If it is still profitable at this point, and has not found a permutation
+    // yet, try again with any shuffle index.
+    if (Profitable && Permutation.empty()) {
+      PermuteAndPairVector<decltype(ShiftAmt), decltype(Permutation),
+                           SmallMapVector<uint16_t, std::pair<uint16_t, int>, 8>>(ShiftAmt, Permutation);
+    }
+
+    // Found a permutation P that can rearrange the shift amouts into adjacent
+    // pair of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
+    if (!Permutation.empty()) {
+      SDValue InnerShuffle = DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
+      SmallVector<SDValue, 64> NewShiftAmt;
+      for (int Index : Permutation) {
+        NewShiftAmt.push_back(Amt.getOperand(Index));
+      }
+#ifndef NDEBUG
+      for (size_t I = 0; I < NewShiftAmt.size(); I += 2) {
+        SDValue Even = NewShiftAmt[I];
+        SDValue Odd = NewShiftAmt[I + 1];
+        assert(Even.isUndef() || Odd.isUndef() || Even->getAsZExtVal() == Odd->getAsZExtVal());
+      }
+#endif
+      SDValue NewShiftVector = DAG.getBuildVector(VT, dl, NewShiftAmt);
+      SDValue NewShift = DAG.getNode(Opc, dl, VT, InnerShuffle, NewShiftVector);
+      SmallVector<int, 64> InversePermutation(Permutation.size());
+      for (size_t I = 0; I < Permutation.size(); ++I) {
+        InversePermutation[Permutation[I]] = I;
+      }
+      SDValue OuterShuffle = DAG.getVectorShuffle(VT, dl, NewShift, DAG.getUNDEF(VT), InversePermutation);
+      return OuterShuffle;
+    }
+  }
+
   // If possible, lower this packed shift into a vector multiply instead of
   // expanding it into a sequence of scalar shifts.
   // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.

>From 0a0f4805c0bd86186914001e1ec8419fe58945b2 Mon Sep 17 00:00:00 2001
From: William Huang <williamjhuang at google.com>
Date: Thu, 28 Nov 2024 01:47:21 -0500
Subject: [PATCH 2/8] Second version: more cpu latency measurement with
 llvm-mca

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 174 +++++++++++++-----------
 1 file changed, 98 insertions(+), 76 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9bfa8cd6610cd5..aba3968c29eb9b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29769,24 +29769,27 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
 // Given a vector of values, find a permutation such that every adjacent even-
 // odd pair has the same value. ~0 is reserved as a special value for wildcard,
 // which can be paired with any value. Returns true if a permutation is found.
+// If output Permutation is not empty, permutation index starts at its previous
+// size, so that this function can concatenate the result of multiple calls.
+// UnpairedInputs contains values yet to be paired, mapping an unpaired value to
+// its current neighbor's value and index.
+// Do not use llvm::DenseMap as ~0 is reserved key.
 template <typename InputTy,
          typename PermutationTy,
-         typename MapTy = std::unordered_map<typename InputTy::value_type,
-                                         std::pair<typename InputTy::value_type, typename PermutationTy::value_type>>>
+         typename MapTy = SmallMapVector<typename InputTy::value_type,
+                                         std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8>>
 static bool PermuteAndPairVector(const InputTy& Inputs,
-                                 PermutationTy &Permutation) {
+                                 PermutationTy &Permutation,
+                                 MapTy UnpairedInputs = SmallMapVector<typename InputTy::value_type,
+                                                                       std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8>()) {
   const auto Wildcard = ~typename InputTy::value_type();
-
-  // List of values to be paired, mapping an unpaired value to its current
-  // neighbor's value and index.
-  MapTy UnpairedInputs;
   SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
 
-  Permutation.clear();
+  size_t OutputOffset = Permutation.size();
   typename PermutationTy::value_type I = 0;
   for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end(); InputIt != InputEnd;) {
-    Permutation.push_back(I);
-    Permutation.push_back(I + 1);
+    Permutation.push_back(OutputOffset + I);
+    Permutation.push_back(OutputOffset + I + 1);
 
     auto Even = *InputIt++;
     assert(InputIt != InputEnd && "Expected even number of elements");
@@ -29806,7 +29809,7 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
           // value's neighbor, otherwise the current value is added to the map.
           if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(This, std::make_pair(Other, OtherIndex)); !Inserted) {
             auto [SwapValue, SwapIndex] = MapIt->second;
-            std::swap(Permutation[SwapIndex], Permutation[ThisIndex]);
+            std::swap(Permutation[OutputOffset + SwapIndex], Permutation[OutputOffset + ThisIndex]);
             This = SwapValue;
             UnpairedInputs.erase(MapIt);
 
@@ -29850,13 +29853,12 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
     if (Neighbor != Wildcard) {
       assert(UnpairedInputs.count(Neighbor));
       if (WildcardPairs.size()) {
-        std::swap(Permutation[WildcardPairs.back()], Permutation[NeighborIndex]);
+        std::swap(Permutation[OutputOffset + WildcardPairs.back()], Permutation[OutputOffset + NeighborIndex]);
         WildcardPairs.pop_back();
         // Mark the neighbor as processed.
         UnpairedInputs[Neighbor].first = Wildcard;
-      } else {
+      } else
         return false;
-      }
     }
   }
   return true;
@@ -30140,23 +30142,22 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     }
   }
 
-  // ISD::SRA/SRL/SHL on vXi8 can be widened to vYi16 (Y = X/2) if the constant
-  // amounts can be shuffled such that every pair of adjacent elements has the
-  // same value. This introduces an extra shuffle before and after the shift,
-  // and it is profitable if the operand is aready a shuffle so that both can
-  // be merged, or if the extra shuffle is fast (can use VPSHUFB).
+  // SHL/SRL/SRA on vXi8 can be widened to vYi16 or vYi32 if the constant
+  // amounts can be shuffled such that every pair or quad of adjacent elements
+  // has the same value. This introduces an extra shuffle before and after the
+  // shift, and it is profitable if the operand is aready a shuffle so that both
+  // can be merged and the extra shuffle is fast. This is not profitable on
+  // AVX512 becasue it has 16-bit vector variable shift instruction VPS**VW.
   // (shift (shuffle X P1) S1) ->
   // (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
   // widened, and P2^-1 is the inverse shuffle of P2.
-  if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse() && Subtarget.hasSSE3()) {
-    bool Profitable = true;
-    // VPAND ymm only available on AVX2.
-    if (VT == MVT::v32i8 || VT == MVT::v64i8) {
-      Profitable = Subtarget.hasAVX2();
-    }
+  if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse()
+      && Subtarget.hasSSE3() && !Subtarget.hasAVX512()) {
+    constexpr size_t LaneBytes = 16;
+    const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
 
     SmallVector<int, 64> Permutation;
-    SmallVector<uint16_t, 64> ShiftAmt;
+    SmallVector<uint8_t, 64> ShiftAmt;
     for (size_t I = 0; I < Amt.getNumOperands(); ++I) {
       if (Amt.getOperand(I).isUndef())
         ShiftAmt.push_back(~0);
@@ -30164,63 +30165,84 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
         ShiftAmt.push_back(Amt.getConstantOperandVal(I));
     }
 
-    if (Profitable && (VT == MVT::v32i8 || VT == MVT::v64i8)) {
-      Profitable = false;
-      constexpr size_t LaneBytes = 16;
-      const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
-
-      // For v32i8 or v64i8, we should check if we can generate a shuffle that
-      // may be lowered to VPSHUFB, because it is faster than VPERMB. This is
-      // possible if we can apply the same shuffle mask to each v16i8 lane.
-      // For example (assuming a lane has 4 elements for simplicity),
-      // <1, 2, 2, 1, 4, 3, 3, 4> is handled as <14, 23, 23, 14>, which can
-      // be shuffled to adjacent pairs <14, 14, 23, 23> with the VPSHUFB mask
-      // <0, 3, 2, 1> (or high level mask <0, 3, 2, 1, 4, 7, 6, 5>).
-      // Limitation: if there are some undef in shift amounts, this algorithm
-      // may not find a solution even if one exists, as here we only treat a
-      // VPSHUFB index as undef if all shuffle amounts of the same index modulo
-      // lane size are all undef.
-      // Since a byte can only be shifted by 7 bits without being UB, 4 bits are
-      // enough to represent the shift amount or undef (0xF).
-      std::array<uint16_t, LaneBytes> VPSHUFBShiftAmt = {};
-      for (size_t I = 0; I < LaneBytes; ++I)
-        for (size_t J = 0; J < NumLanes; ++J)
-          VPSHUFBShiftAmt[I] |= (ShiftAmt[I + J * LaneBytes] & 0xF) << (J * 4);
-      if (VT == MVT::v32i8) {
-        for (size_t I = 0; I < LaneBytes; ++I)
-          VPSHUFBShiftAmt[I] |= 0xFF00;
-      }
-      if (PermuteAndPairVector(VPSHUFBShiftAmt, Permutation)) {
-        // Found a VPSHUFB solution, offset the shuffle amount to other lanes.
-        Permutation.resize(VT.getVectorNumElements());
-        for (size_t I = 0; I < LaneBytes; ++I)
-          for (size_t J = 1; J < NumLanes; ++J)
-            Permutation[I + J * LaneBytes] = Permutation[I] + J * LaneBytes;
-        Profitable = true;
-      } else if (R.getOpcode() == ISD::VECTOR_SHUFFLE) {
-        // A slower shuffle is profitable if the operand is also a slow shuffle,
-        // such that they can be merged.
-        // TODO: Use TargetTransformInfo to systematically determine whether
-        // inner shuffle is slow. Currently we only check if it contains
-        // cross-lane shuffle.
-        if (ShuffleVectorSDNode *InnerShuffle = dyn_cast<ShuffleVectorSDNode>(R.getNode())) {
-          if (InnerShuffle->getMask().size() == VT.getVectorNumElements() &&
-              is128BitLaneCrossingShuffleMask(VT, InnerShuffle->getMask()))
-            Profitable = true;
+    // Check if we can find an in-lane shuffle to rearrange the shift amounts,
+    // if so, this transformation may be profitable.
+    bool Profitable;
+    for (size_t I = 0; I < NumLanes; ++I) {
+      if (!(Profitable = PermuteAndPairVector(ArrayRef(&ShiftAmt[I * LaneBytes], LaneBytes), Permutation)))
+        break;
+    }
+
+    // For AVX2, check if we can further rearrange shift amounts into adjacent
+    // quads, so that it can use VPS*LVD instead of VPMUL*W as it is 2 cycles
+    // faster.
+    bool IsAdjacentQuads = false;
+    if (Profitable && Subtarget.hasAVX2()) {
+      SmallVector<uint8_t, 64> EveryOtherShiftAmt;
+      for (size_t I = 0; I < Permutation.size(); I += 2) {
+        uint8_t Shift1 = ShiftAmt[Permutation[I]];
+        uint8_t Shift2 = ShiftAmt[Permutation[I + 1]];
+        assert(Shift1 == Shift2 || ~Shift1 == 0 || ~Shift2 == 0);
+        EveryOtherShiftAmt.push_back(~Shift1 ? Shift1 : Shift2);
+      }
+      SmallVector<int, 32> Permutation2;
+      for (size_t I = 0; I < NumLanes; ++I) {
+        if (!(IsAdjacentQuads = PermuteAndPairVector(ArrayRef(&EveryOtherShiftAmt[I * LaneBytes / 2], LaneBytes / 2), Permutation2)))
+          break;
+      }
+      if (IsAdjacentQuads) {
+        SmallVector<int, 64> CombinedPermutation;
+        for (int Index : Permutation2) {
+          CombinedPermutation.push_back(Permutation[Index * 2]);
+          CombinedPermutation.push_back(Permutation[Index * 2 + 1]);
         }
+        std::swap(Permutation, CombinedPermutation);
       }
     }
 
-    // If it is still profitable at this point, and has not found a permutation
-    // yet, try again with any shuffle index.
-    if (Profitable && Permutation.empty()) {
-      PermuteAndPairVector<decltype(ShiftAmt), decltype(Permutation),
-                           SmallMapVector<uint16_t, std::pair<uint16_t, int>, 8>>(ShiftAmt, Permutation);
+    // For right shifts, (V)PMULHUW needs an extra instruction to handle an
+    // amount of 0, disabling the transformation here to be cautious.
+    if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
+        any_of(ShiftAmt, [](auto x) { return x == 0; }))
+      Profitable = false;
+
+    bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
+    // If operand R is not a shuffle by itself, the transformation here adds two
+    // shuffles, adding a non-trivial cost. Here we take out a few cases where
+    // the benefit is questionable according to llvm-mca's modeling.
+    //
+    // Each cell shows latency before/after transform. Here R is not a shuffle.
+    // SSE3
+    //      | v16i8 | v32i8 | v64i8
+    // ----------------------------
+    // SLL  | 17/17 | 20/20 | 26/26
+    // SRL  | 18/17 | 22/20 | 35/26
+    // SRA  | 21/19 | 26/22 | 39/30
+    // AVX2 using VPMUL*W
+    //      | v16i8 | v32i8 | v64i8
+    // ----------------------------
+    // SLL  | 20/18 | 18/18 | 21/21
+    // SRL  | 20/18 | 22/18 | 26/21
+    // SRA  | 20/20 | 22/20 | 25/23
+    // AVX2 using VPS*LVD
+    //      | v16i8 | v32i8 | v64i8
+    // ----------------------------
+    // SLL  | 20/16 | 18/16 | 21/20
+    // SRL  | 20/16 | 22/16 | 26/20
+    // SRA  | 20/18 | 22/18 | 25/22
+    if (!IsOperandShuffle) {
+      if (Subtarget.hasAVX2()) {
+        if (!IsAdjacentQuads || (VT == MVT::v64i8 && Opc == ISD::SHL))
+          Profitable = false;
+      } else {
+        if (Opc == ISD::SHL || ((VT == MVT::v16i8 || VT == MVT::v32i8) && Opc == ISD::SRL))
+          Profitable = false;
+      }
     }
 
     // Found a permutation P that can rearrange the shift amouts into adjacent
-    // pair of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
-    if (!Permutation.empty()) {
+    // pair or quad of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
+    if (Profitable) {
       SDValue InnerShuffle = DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
       SmallVector<SDValue, 64> NewShiftAmt;
       for (int Index : Permutation) {

>From 02249f3c811568e31e78b9290bb2189a089bc5ae Mon Sep 17 00:00:00 2001
From: William Huang <williamjhuang at google.com>
Date: Thu, 28 Nov 2024 03:39:13 -0500
Subject: [PATCH 3/8] format

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 55 ++++++++++++++++---------
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aba3968c29eb9b..90d7be73c62126 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29774,20 +29774,22 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
 // UnpairedInputs contains values yet to be paired, mapping an unpaired value to
 // its current neighbor's value and index.
 // Do not use llvm::DenseMap as ~0 is reserved key.
-template <typename InputTy,
-         typename PermutationTy,
-         typename MapTy = SmallMapVector<typename InputTy::value_type,
-                                         std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8>>
-static bool PermuteAndPairVector(const InputTy& Inputs,
-                                 PermutationTy &Permutation,
-                                 MapTy UnpairedInputs = SmallMapVector<typename InputTy::value_type,
-                                                                       std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8>()) {
+template <typename InputTy, typename PermutationTy,
+          typename MapTy =
+              SmallMapVector<typename InputTy::value_type,
+                             std::pair<typename InputTy::value_type,
+                                       typename PermutationTy::value_type>,
+                             8>>
+static bool PermuteAndPairVector(
+    const InputTy &Inputs, PermutationTy &Permutation,
+    MapTy UnpairedInputs = MapTy()) {
   const auto Wildcard = ~typename InputTy::value_type();
   SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
 
   size_t OutputOffset = Permutation.size();
   typename PermutationTy::value_type I = 0;
-  for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end(); InputIt != InputEnd;) {
+  for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end();
+       InputIt != InputEnd;) {
     Permutation.push_back(OutputOffset + I);
     Permutation.push_back(OutputOffset + I + 1);
 
@@ -29802,14 +29804,18 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
 
     // If both are equal, they are in good position.
     if (Even != Odd) {
-      auto DoWork = [&] (auto &This, auto ThisIndex, auto Other, auto OtherIndex) {
+      auto DoWork = [&](auto &This, auto ThisIndex, auto Other,
+                        auto OtherIndex) {
         if (This != Wildcard) {
           // For non-wildcard value, check if it can pair with an exisiting
           // unpaired value from UnpairedInputs, if so, swap with the unpaired
           // value's neighbor, otherwise the current value is added to the map.
-          if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(This, std::make_pair(Other, OtherIndex)); !Inserted) {
+          if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(
+                  This, std::make_pair(Other, OtherIndex));
+              !Inserted) {
             auto [SwapValue, SwapIndex] = MapIt->second;
-            std::swap(Permutation[OutputOffset + SwapIndex], Permutation[OutputOffset + ThisIndex]);
+            std::swap(Permutation[OutputOffset + SwapIndex],
+                      Permutation[OutputOffset + ThisIndex]);
             This = SwapValue;
             UnpairedInputs.erase(MapIt);
 
@@ -29831,7 +29837,9 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
                 UnpairedInputs[This] = std::make_pair(Other, OtherIndex);
               }
               // If its neighbor is also in UnpairedInputs, update its info too.
-              if (auto OtherMapIt = UnpairedInputs.find(Other); OtherMapIt != UnpairedInputs.end() && OtherMapIt->second.second == ThisIndex) {
+              if (auto OtherMapIt = UnpairedInputs.find(Other);
+                  OtherMapIt != UnpairedInputs.end() &&
+                  OtherMapIt->second.second == ThisIndex) {
                 OtherMapIt->second.first = This;
               }
             }
@@ -29849,11 +29857,12 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
   // Now check if each remaining unpaired neighboring values can be swapped with
   // a wildcard pair to form two paired values.
   for (auto &[Unpaired, V] : UnpairedInputs) {
-    auto [Neighbor, NeighborIndex]  = V;
+    auto [Neighbor, NeighborIndex] = V;
     if (Neighbor != Wildcard) {
       assert(UnpairedInputs.count(Neighbor));
       if (WildcardPairs.size()) {
-        std::swap(Permutation[OutputOffset + WildcardPairs.back()], Permutation[OutputOffset + NeighborIndex]);
+        std::swap(Permutation[OutputOffset + WildcardPairs.back()],
+                  Permutation[OutputOffset + NeighborIndex]);
         WildcardPairs.pop_back();
         // Mark the neighbor as processed.
         UnpairedInputs[Neighbor].first = Wildcard;
@@ -30151,8 +30160,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
   // (shift (shuffle X P1) S1) ->
   // (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
   // widened, and P2^-1 is the inverse shuffle of P2.
-  if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse()
-      && Subtarget.hasSSE3() && !Subtarget.hasAVX512()) {
+  if (ConstantAmt &&
+      (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) &&
+      R.hasOneUse() && Subtarget.hasSSE3() && !Subtarget.hasAVX512()) {
     constexpr size_t LaneBytes = 16;
     const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
 
@@ -30169,7 +30179,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     // if so, this transformation may be profitable.
     bool Profitable;
     for (size_t I = 0; I < NumLanes; ++I) {
-      if (!(Profitable = PermuteAndPairVector(ArrayRef(&ShiftAmt[I * LaneBytes], LaneBytes), Permutation)))
+      if (!(Profitable = PermuteAndPairVector(
+                ArrayRef(&ShiftAmt[I * LaneBytes], LaneBytes), Permutation)))
         break;
     }
 
@@ -30187,7 +30198,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       }
       SmallVector<int, 32> Permutation2;
       for (size_t I = 0; I < NumLanes; ++I) {
-        if (!(IsAdjacentQuads = PermuteAndPairVector(ArrayRef(&EveryOtherShiftAmt[I * LaneBytes / 2], LaneBytes / 2), Permutation2)))
+        if (!(IsAdjacentQuads = PermuteAndPairVector(
+                  ArrayRef(&EveryOtherShiftAmt[I * LaneBytes / 2],
+                           LaneBytes / 2),
+                  Permutation2)))
           break;
       }
       if (IsAdjacentQuads) {
@@ -30235,7 +30249,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
         if (!IsAdjacentQuads || (VT == MVT::v64i8 && Opc == ISD::SHL))
           Profitable = false;
       } else {
-        if (Opc == ISD::SHL || ((VT == MVT::v16i8 || VT == MVT::v32i8) && Opc == ISD::SRL))
+        if (Opc == ISD::SHL ||
+            ((VT == MVT::v16i8 || VT == MVT::v32i8) && Opc == ISD::SRL))
           Profitable = false;
       }
     }

>From 0bdbc64b8a09393c1e5e3dddc385550f4b95dea0 Mon Sep 17 00:00:00 2001
From: William Huang <williamjhuang at google.com>
Date: Fri, 13 Dec 2024 17:14:51 -0500
Subject: [PATCH 4/8] bug fixes

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 73 +++++++++++--------------
 1 file changed, 32 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 90d7be73c62126..879f98708894cb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28,7 +28,6 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/ObjCARCUtil.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -29782,8 +29781,8 @@ template <typename InputTy, typename PermutationTy,
                              8>>
 static bool PermuteAndPairVector(
     const InputTy &Inputs, PermutationTy &Permutation,
-    MapTy UnpairedInputs = MapTy()) {
-  const auto Wildcard = ~typename InputTy::value_type();
+    MapTy UnpairedInputs = MapTy()) {static_assert(std::is_same<typename InputTy::value_type, uint8_t>::value);
+  const typename InputTy::value_type Wildcard = ~0;
   SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
 
   size_t OutputOffset = Permutation.size();
@@ -30155,14 +30154,16 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
   // amounts can be shuffled such that every pair or quad of adjacent elements
   // has the same value. This introduces an extra shuffle before and after the
   // shift, and it is profitable if the operand is aready a shuffle so that both
-  // can be merged and the extra shuffle is fast. This is not profitable on
-  // AVX512 becasue it has 16-bit vector variable shift instruction VPS**VW.
+  // can be merged or the extra shuffle is fast.
   // (shift (shuffle X P1) S1) ->
   // (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
   // widened, and P2^-1 is the inverse shuffle of P2.
+  // This is not profitable on XOP or AVX512 becasue it has 8/16-bit vector
+  // variable shift instructions.
   if (ConstantAmt &&
       (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) &&
-      R.hasOneUse() && Subtarget.hasSSE3() && !Subtarget.hasAVX512()) {
+      R.hasOneUse() && Subtarget.hasSSSE3() && !Subtarget.hasAVX512() &&
+      !Subtarget.hasXOP()) {
     constexpr size_t LaneBytes = 16;
     const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
 
@@ -30176,7 +30177,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     }
 
     // Check if we can find an in-lane shuffle to rearrange the shift amounts,
-    // if so, this transformation may be profitable.
+    // if so, this transformation may be profitable. Cross-lane shuffle is
+    // almost never profitable because there is no general 1-instruction
+    // solution.
     bool Profitable;
     for (size_t I = 0; I < NumLanes; ++I) {
       if (!(Profitable = PermuteAndPairVector(
@@ -30193,8 +30196,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       for (size_t I = 0; I < Permutation.size(); I += 2) {
         uint8_t Shift1 = ShiftAmt[Permutation[I]];
         uint8_t Shift2 = ShiftAmt[Permutation[I + 1]];
-        assert(Shift1 == Shift2 || ~Shift1 == 0 || ~Shift2 == 0);
-        EveryOtherShiftAmt.push_back(~Shift1 ? Shift1 : Shift2);
+        assert(Shift1 == Shift2 || Shift1 == (uint8_t) ~0 ||
+               Shift2 == (uint8_t) ~0);
+        EveryOtherShiftAmt.push_back(Shift1 != (uint8_t) ~0 ? Shift1 : Shift2);
       }
       SmallVector<int, 32> Permutation2;
       for (size_t I = 0; I < NumLanes; ++I) {
@@ -30214,43 +30218,27 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       }
     }
 
-    // For right shifts, (V)PMULHUW needs an extra instruction to handle an
-    // amount of 0, disabling the transformation here to be cautious.
+    // For right shifts, (V)PMULHUW needs 2 extra instructions to handle an
+    // amount of 0, making it unprofitable.
     if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
         any_of(ShiftAmt, [](auto x) { return x == 0; }))
       Profitable = false;
 
     bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
-    // If operand R is not a shuffle by itself, the transformation here adds two
-    // shuffles, adding a non-trivial cost. Here we take out a few cases where
-    // the benefit is questionable according to llvm-mca's modeling.
-    //
-    // Each cell shows latency before/after transform. Here R is not a shuffle.
-    // SSE3
-    //      | v16i8 | v32i8 | v64i8
-    // ----------------------------
-    // SLL  | 17/17 | 20/20 | 26/26
-    // SRL  | 18/17 | 22/20 | 35/26
-    // SRA  | 21/19 | 26/22 | 39/30
-    // AVX2 using VPMUL*W
-    //      | v16i8 | v32i8 | v64i8
-    // ----------------------------
-    // SLL  | 20/18 | 18/18 | 21/21
-    // SRL  | 20/18 | 22/18 | 26/21
-    // SRA  | 20/20 | 22/20 | 25/23
-    // AVX2 using VPS*LVD
-    //      | v16i8 | v32i8 | v64i8
-    // ----------------------------
-    // SLL  | 20/16 | 18/16 | 21/20
-    // SRL  | 20/16 | 22/16 | 26/20
-    // SRA  | 20/18 | 22/18 | 25/22
+    // If operand R is a shuffle, one of the two shuffles introduced by this
+    // transformation can be merged with it, and the extra shuffle is 1 cycle.
+    // This is generally profitable because it eliminates one (or both) vector
+    // multiplication, which has to be scheduled at least 1 cycle apart.
+    // If operand R is not a shuffle, several cases are not profitable based on
+    // pipeline modeling, so we are excluding them here.
     if (!IsOperandShuffle) {
-      if (Subtarget.hasAVX2()) {
-        if (!IsAdjacentQuads || (VT == MVT::v64i8 && Opc == ISD::SHL))
+      // A hack to detect AMD CPU.
+      if (Subtarget.hasSSE4A() && Opc == ISD::SRA) {
+        if (Opc == ISD::SRA)
           Profitable = false;
       } else {
-        if (Opc == ISD::SHL ||
-            ((VT == MVT::v16i8 || VT == MVT::v32i8) && Opc == ISD::SRL))
+        if ((Subtarget.hasAVX() && !Subtarget.hasAVX2()) ||
+            (Subtarget.hasAVX2() && !IsAdjacentQuads))
           Profitable = false;
       }
     }
@@ -30258,7 +30246,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     // Found a permutation P that can rearrange the shift amouts into adjacent
     // pair or quad of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
     if (Profitable) {
-      SDValue InnerShuffle = DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
+      SDValue InnerShuffle =
+          DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
       SmallVector<SDValue, 64> NewShiftAmt;
       for (int Index : Permutation) {
         NewShiftAmt.push_back(Amt.getOperand(Index));
@@ -30267,7 +30256,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       for (size_t I = 0; I < NewShiftAmt.size(); I += 2) {
         SDValue Even = NewShiftAmt[I];
         SDValue Odd = NewShiftAmt[I + 1];
-        assert(Even.isUndef() || Odd.isUndef() || Even->getAsZExtVal() == Odd->getAsZExtVal());
+        assert(Even.isUndef() || Odd.isUndef() ||
+               Even->getAsZExtVal() == Odd->getAsZExtVal());
       }
 #endif
       SDValue NewShiftVector = DAG.getBuildVector(VT, dl, NewShiftAmt);
@@ -30276,7 +30266,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       for (size_t I = 0; I < Permutation.size(); ++I) {
         InversePermutation[Permutation[I]] = I;
       }
-      SDValue OuterShuffle = DAG.getVectorShuffle(VT, dl, NewShift, DAG.getUNDEF(VT), InversePermutation);
+      SDValue OuterShuffle = DAG.getVectorShuffle(
+          VT, dl, NewShift, DAG.getUNDEF(VT), InversePermutation);
       return OuterShuffle;
     }
   }

>From 0c7f8f286f9c66c52eea11cb95b4637ea1fa629d Mon Sep 17 00:00:00 2001
From: William Huang <williamjhuang at google.com>
Date: Fri, 13 Dec 2024 17:15:07 -0500
Subject: [PATCH 5/8] added test cases

---
 llvm/test/CodeGen/X86/vector-shift-widen.ll | 306 ++++++++++++++++++++
 1 file changed, 306 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/vector-shift-widen.ll

diff --git a/llvm/test/CodeGen/X86/vector-shift-widen.ll b/llvm/test/CodeGen/X86/vector-shift-widen.ll
new file mode 100644
index 00000000000000..556f34719b6c48
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-shift-widen.ll
@@ -0,0 +1,306 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,-avx,-avx2 | FileCheck %s --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+sse4a | FileCheck %s --check-prefix=ZNVER1
+;
+; Check the permutation of a variable shift with i8 vector into a widened shift.
+;
+
+; Transform only occurs on SSSE3 because operand is not a shuffle, and shift
+; amounts cannot be rearranged to quads. Not checking the correctness of
+; untransformed variants here as they are covered by other vector shift checks.
+define <16 x i8> @shl_v16i8(<16 x i8> %a) {
+; SSSE3-LABEL: shl_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [8,1,2,12,4,5,6,7,0,9,10,11,3,13,14,15]
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4,1,1,8,1,16,32]
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    retq
+;
+; AVX-LABEL: shl_v16i8:
+; AVX:         # %bb.0:
+; AVX-NOT:       pshufb
+; AVX-NOT:       vpshufb
+; AVX:           retq
+;
+; AVX2-LABEL: shl_v16i8:
+; AVX2:        # %bb.0:
+; AVX2-NOT:      pshufb
+; AVX2-NOT:      vpshufb
+; AVX2:          retq
+  %shift = shl <16 x i8> %a, <i8 3, i8 0, i8 2, i8 4, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 3, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 5>
+  ret <16 x i8> %shift
+}
+
+define <16 x i8> @lshr_v16i8(<16 x i8> %a) {
+; SSSE3-LABEL: lshr_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[2,1,4,3,6,5,8,7,10,9,12,11,14,13,0,15]
+; SSSE3-NEXT:    pmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,2048,8192,16384,32768,8192,2048,4096]
+; SSSE3-NEXT:    pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15]
+; SSSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    retq
+;
+; AVX-LABEL: lshr_v16i8:
+; AVX:         # %bb.0:
+; AVX-NOT:       pshufb
+; AVX-NOT:       vpshufb
+; AVX:           retq
+;
+; AVX2-LABEL: lshr_v16i8:
+; AVX2:        # %bb.0:
+; AVX2-NOT:      pshufb
+; AVX2-NOT:      vpshufb
+; AVX2:          retq
+  %shift = lshr <16 x i8> %a, <i8 4, i8 2, i8 2, i8 5, i8 5, i8 3, i8 3, i8 2, i8 2, i8 1, i8 1, i8 3, i8 3, i8 5, i8 5, i8 4>
+  ret <16 x i8> %shift
+}
+
+define <16 x i8> @ashr_v16i8(<16 x i8> %a) {
+; SSSE3-LABEL: ashr_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,12,2,3,4,9,11,7,8,13,10,6,1,14,5,15]
+; SSSE3-NEXT:    pmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,8192,512,8192,4096,1024,32768,2048]
+; SSSE3-NEXT:    pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,12,2,3,4,14,11,7,8,5,10,6,1,9,13,15]
+; SSSE3-NEXT:    pand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [32,64,16,16,1,4,2,16,8,1,u,16,32,8,64,4]
+; SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSSE3-NEXT:    psubb %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; AVX-LABEL: ashr_v16i8:
+; AVX:         # %bb.0:
+; AVX-NOT:       pshufb
+; AVX-NOT:       vpshufb
+; AVX:           retq
+;
+; AVX2-LABEL: ashr_v16i8:
+; AVX2:        # %bb.0:
+; AVX2-NOT:      pshufb
+; AVX2-NOT:      vpshufb
+; AVX2:          retq
+  %shift = ashr <16 x i8> %a, <i8 2, i8 1, i8 3, i8 3, i8 7, i8 5, i8 6, i8 3, i8 4, i8 7, i8 undef, i8 3, i8 2, i8 4, i8 1, i8 5>
+  ret <16 x i8> %shift
+}
+
+; Shift amounts cannot be paired.
+define <16 x i8> @not_shl_v16i8(<16 x i8> %a) {
+; SSSE3-LABEL: not_shl_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NOT:     pshufb
+; SSSE3-NOT:     vpshufb
+; SSSE3:         retq
+;
+; AVX-LABEL: not_shl_v16i8:
+; AVX:         # %bb.0:
+; AVX-NOT:       pshufb
+; AVX-NOT:       vpshufb
+; AVX:           retq
+;
+; AVX2-LABEL: not_shl_v16i8:
+; AVX2:        # %bb.0:
+; AVX2-NOT:      pshufb
+; AVX2-NOT:      vpshufb
+; AVX2:          retq
+  %shift = shl <16 x i8> %a, <i8 2, i8 1, i8 3, i8 0, i8 7, i8 5, i8 6, i8 4, i8 2, i8 1, i8 3, i8 0, i8 7, i8 5, i8 6, i8 5>
+  ret <16 x i8> %shift
+}
+
+; Right shift amounts containing zero and cannot form quads.
+define <16 x i8> @not_lshr_v16i8(<16 x i8> %a) {
+; SSSE3-LABEL: not_lshr_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NOT:     pshufb
+; SSSE3-NOT:     vpshufb
+; SSSE3:         retq
+;
+; AVX-LABEL: not_lshr_v16i8:
+; AVX:         # %bb.0:
+; AVX-NOT:       pshufb
+; AVX-NOT:       vpshufb
+; AVX:           retq
+;
+; AVX2-LABEL: not_lshr_v16i8:
+; AVX2:        # %bb.0:
+; AVX2-NOT:      pshufb
+; AVX2-NOT:      vpshufb
+; AVX2:          retq
+  %shift = lshr <16 x i8> %a, <i8 4, i8 2, i8 2, i8 5, i8 5, i8 3, i8 3, i8 2, i8 2, i8 1, i8 1, i8 0, i8 0, i8 5, i8 5, i8 4>
+  ret <16 x i8> %shift
+}
+
+; Shift cannot form quads and operand is not shuffle, only transform on SSSE3.
+define <32 x i8> @shl_v32i8(<32 x i8> %a) {
+; SSSE3-LABEL: shl_v32i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # xmm2 = [0,2,1,3,6,5,4,7,8,9,12,11,10,13,14,15]
+; SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # xmm3 = [1,4,8,2,16,32,64,16]
+; SSSE3-NEXT:    pmullw %xmm3, %xmm0
+; SSSE3-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # xmm4 = [255,252,255,252,254,248,248,254,240,240,192,224,224,192,240,240]
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3-NEXT:    pmullw %xmm3, %xmm1
+; SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3-NEXT:    pand %xmm4, %xmm1
+; SSSE3-NEXT:    retq
+;
+; AVX-LABEL: shl_v32i8:
+; AVX:         # %bb.0:
+; AVX-NOT:       pshufb
+; AVX-NOT:       vpshufb
+; AVX:           retq
+;
+; AVX2-LABEL: shl_v32i8:
+; AVX2:        # %bb.0:
+; AVX2-NOT:      pshufb
+; AVX2-NOT:      vpshufb
+; AVX2:          retq
+  %shift = shl <32 x i8> %a, <i8 0, i8 2, i8 0, i8 2, i8 1, i8 3, i8 3, i8 1, i8 4, i8 4, i8 6, i8 5, i8 5, i8 6, i8 4, i8 4,
+                              i8 0, i8 2, i8 0, i8 2, i8 1, i8 3, i8 3, i8 1, i8 4, i8 4, i8 6, i8 5, i8 5, i8 6, i8 4, i8 4>
+  ret <32 x i8> %shift
+}
+
+; For quads only testing on AVX2 as it has vps**vd.
+define <32 x i8> @shl_v32i8_quad(<32 x i8> %a) {
+; AVX2-LABEL: shl_v32i8_quad:
+; AVX2:        # %bb.0:
+; AVX2-NEXT:     vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
+; AVX2-NEXT:     vpsllvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:     vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
+; AVX2-NEXT:     vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:     retq
+;
+; ZNVER1-LABEL: shl_v32i8_quad:
+; ZNVER1:      # %bb.0:
+; ZNVER1-NEXT:   vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
+; ZNVER1-NEXT:   vpsllvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; ZNVER1-NEXT:   vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
+; ZNVER1-NEXT:   vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; ZNVER1-NEXT:   retq
+  %shift = shl <32 x i8> %a, <i8 0, i8 2, i8 4, i8 6, i8 4, i8 0, i8 6, i8 2, i8 2, i8 0, i8 4, i8 6, i8 6, i8 0, i8 4, i8 2,
+                              i8 1, i8 3, i8 5, i8 7, i8 1, i8 3, i8 5, i8 7, i8 7, i8 5, i8 3, i8 1, i8 7, i8 5, i8 3, i8 1>
+  ret <32 x i8> %shift
+}
+
+define <32 x i8> @lshr_v32i8_quad(<32 x i8> %a) {
+; AVX2-LABEL: lshr_v32i8_quad:
+; AVX2:        # %bb.0:
+; AVX2-NEXT:     vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
+; AVX2-NEXT:     vpsrlvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:     vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
+; AVX2-NEXT:     vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:     retq
+;
+; ZNVER1-LABEL: lshr_v32i8_quad:
+; ZNVER1:      # %bb.0:
+; ZNVER1-NEXT:   vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
+; ZNVER1-NEXT:   vpsrlvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; ZNVER1-NEXT:   vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
+; ZNVER1-NEXT:   vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; ZNVER1-NEXT:   retq
+  %shift = lshr <32 x i8> %a, <i8 0, i8 2, i8 4, i8 6, i8 4, i8 0, i8 6, i8 2, i8 2, i8 0, i8 4, i8 6, i8 6, i8 0, i8 4, i8 2,
+                               i8 1, i8 3, i8 5, i8 7, i8 1, i8 3, i8 5, i8 7, i8 7, i8 5, i8 3, i8 1, i8 7, i8 5, i8 3, i8 1>
+  ret <32 x i8> %shift
+}
+
+; Disabling the transform for AMD Zen because it can schedule two vpmullw 2
+; cycles faster compared to Intel.
+define <32 x i8> @ashr_v32i8_quad(<32 x i8> %a) {
+; AVX2-LABEL: ashr_v32i8_quad:
+; AVX2:        # %bb.0:
+; AVX2-NEXT:     vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
+; AVX2-NEXT:     vpsrlvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:     vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
+; AVX2-NEXT:     vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:     vmovdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 # ymm1 = [128,32,8,2,8,128,2,32,32,128,8,2,2,128,8,32,64,16,4,1,64,16,4,1,1,4,16,64,1,4,16,64]
+; AVX2-NEXT:     vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:     vpsubb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:     retq
+;
+; ZNVER1-LABEL: ashr_v32i8_quad:
+; ZNVER1:      # %bb.0:
+; ZNVER1-NOT:    pshufb
+; ZNVER1-NOT:    vpshufb
+; ZNVER1:        retq
+  %shift = ashr <32 x i8> %a, <i8 0, i8 2, i8 4, i8 6, i8 4, i8 0, i8 6, i8 2, i8 2, i8 0, i8 4, i8 6, i8 6, i8 0, i8 4, i8 2,
+                               i8 1, i8 3, i8 5, i8 7, i8 1, i8 3, i8 5, i8 7, i8 7, i8 5, i8 3, i8 1, i8 7, i8 5, i8 3, i8 1>
+  ret <32 x i8> %shift
+}
+
+; Shift amounts cannot be paired in lane.
+define <32 x i8> @not_shl_v32i8(<32 x i8> %a) {
+; SSSE3-LABEL: not_shl_v32i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NOT:     pshufb
+; SSSE3-NOT:     vpshufb
+; SSSE3:         retq
+;
+; AVX-LABEL: not_shl_v32i8:
+; AVX:         # %bb.0:
+; AVX-NOT:       pshufb
+; AVX-NOT:       vpshufb
+; AVX:           retq
+;
+; AVX2-LABEL: not_shl_v32i8:
+; AVX2:        # %bb.0:
+; AVX2-NOT:      pshufb
+; AVX2-NOT:      vpshufb
+; AVX2:          retq
+  %shift = shl <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 3,
+                              i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3>
+  ret <32 x i8> %shift
+}
+
+; Always transform if operand is shuffle and shift amounts can be paired.
+define <16 x i8> @lshr_shuffle_v16i8(<16 x i8> %a) {
+; SSSE3-LABEL: lshr_shuffle_v16i8:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15]
+; SSSE3-NEXT:    pmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048]
+; SSSE3-NEXT:    pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
+; SSSE3-NEXT:    pand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSSE3-NEXT:    movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4]
+; SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSSE3-NEXT:    psubb %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; AVX-LABEL: lshr_shuffle_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15]
+; AVX-NEXT:    vpmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048]
+; AVX-NEXT:    vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
+; AVX-NEXT:    vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4]
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: lshr_shuffle_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15]
+; AVX2-NEXT:    vpmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048]
+; AVX2-NEXT:    vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
+; AVX2-NEXT:    vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; ZNVER1-LABEL: lshr_shuffle_v16i8:
+; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15]
+; ZNVER1-NEXT:    vpmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048]
+; ZNVER1-NEXT:    vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
+; ZNVER1-NEXT:    vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; ZNVER1-NEXT:    vmovdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4]
+; ZNVER1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; ZNVER1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; ZNVER1-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  %shift = ashr <16 x i8> %shuffle, <i8 1, i8 2, i8 1, i8 2, i8 2, i8 3, i8 2, i8 3, i8 3, i8 4, i8 3, i8 4, i8 4, i8 5, i8 4, i8 5>
+  ret <16 x i8> %shift
+}

>From 3268bde02575b1bcc8036c71985c680084fec1bf Mon Sep 17 00:00:00 2001
From: William Huang <williamjhuang at google.com>
Date: Sun, 15 Dec 2024 00:50:01 -0500
Subject: [PATCH 6/8] fixed corner cases with shift amt > 8 or undef Updated
 affected tests

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  26 ++--
 llvm/test/CodeGen/X86/combine-sdiv.ll         | 116 +++++++-----------
 llvm/test/CodeGen/X86/vector-fshr-128.ll      |  10 +-
 llvm/test/CodeGen/X86/vector-mul.ll           |  29 ++---
 .../CodeGen/X86/vector-shift-ashr-sub128.ll   |  61 +++++----
 .../CodeGen/X86/vector-shift-lshr-sub128.ll   |  33 +++--
 llvm/test/CodeGen/X86/vector-shift-shl-128.ll |   9 +-
 .../CodeGen/X86/vector-shift-shl-sub128.ll    |  45 ++++---
 llvm/test/CodeGen/X86/vector-shift-widen.ll   |   2 +-
 9 files changed, 155 insertions(+), 176 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 879f98708894cb..72307cbd99d680 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29781,7 +29781,7 @@ template <typename InputTy, typename PermutationTy,
                              8>>
 static bool PermuteAndPairVector(
     const InputTy &Inputs, PermutationTy &Permutation,
-    MapTy UnpairedInputs = MapTy()) {static_assert(std::is_same<typename InputTy::value_type, uint8_t>::value);
+    MapTy UnpairedInputs = MapTy()) {
   const typename InputTy::value_type Wildcard = ~0;
   SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
 
@@ -30160,10 +30160,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
   // widened, and P2^-1 is the inverse shuffle of P2.
   // This is not profitable on XOP or AVX512 becasue it has 8/16-bit vector
   // variable shift instructions.
+  // GFNI is excluded as well: it normally implies AVX512, and there is no
+  // latency data for CPUs with GFNI and only SSE or AVX, although there are
+  // tests for such combinations anyway.
   if (ConstantAmt &&
       (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) &&
       R.hasOneUse() && Subtarget.hasSSSE3() && !Subtarget.hasAVX512() &&
-      !Subtarget.hasXOP()) {
+      !Subtarget.hasXOP() && !Subtarget.hasGFNI()) {
     constexpr size_t LaneBytes = 16;
     const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
 
@@ -30172,8 +30175,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     for (size_t I = 0; I < Amt.getNumOperands(); ++I) {
       if (Amt.getOperand(I).isUndef())
         ShiftAmt.push_back(~0);
-      else
-        ShiftAmt.push_back(Amt.getConstantOperandVal(I));
+      else {
+        auto A = Amt.getConstantOperandVal(I);
+        ShiftAmt.push_back(A > 8 ? 8 : A);
+      }
     }
 
     // Check if we can find an in-lane shuffle to rearrange the shift amounts,
@@ -30221,7 +30226,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     // For right shifts, (V)PMULHUW needs 2 extra instructions to handle an
     // amount of 0, making it unprofitable.
     if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
-        any_of(ShiftAmt, [](auto x) { return x == 0; }))
+        any_of(ShiftAmt, [](uint8_t x) { return x == 0; }))
       Profitable = false;
 
     bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
@@ -30252,14 +30257,17 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       for (int Index : Permutation) {
         NewShiftAmt.push_back(Amt.getOperand(Index));
       }
-#ifndef NDEBUG
+      // If using (V)PMULHUW, any undef pair is resolved to shift by 8 so that
+      // it does not create extra instructions in case it is resolved to 0.
       for (size_t I = 0; I < NewShiftAmt.size(); I += 2) {
-        SDValue Even = NewShiftAmt[I];
-        SDValue Odd = NewShiftAmt[I + 1];
+        SDValue &Even = NewShiftAmt[I];
+        SDValue &Odd = NewShiftAmt[I + 1];
         assert(Even.isUndef() || Odd.isUndef() ||
                Even->getAsZExtVal() == Odd->getAsZExtVal());
+        if (!IsAdjacentQuads && Even.isUndef() && Odd.isUndef())
+          Even = DAG.getConstant(8, dl, VT.getScalarType());
       }
-#endif
+
       SDValue NewShiftVector = DAG.getBuildVector(VT, dl, NewShiftAmt);
       SDValue NewShift = DAG.getNode(Opc, dl, VT, InnerShuffle, NewShiftVector);
       SmallVector<int, 64> InversePermutation(Permutation.size());
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 2b392e69297f07..b14c839a6f1f11 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -351,32 +351,20 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
 ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pcmpgtb %xmm1, %xmm3
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [256,4,2,16,8,32,64,2]
-; SSE41-NEXT:    pmullw %xmm0, %xmm3
-; SSE41-NEXT:    psrlw $8, %xmm3
-; SSE41-NEXT:    pmullw %xmm0, %xmm2
-; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    packuswb %xmm3, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[9,1,2,7,4,12,11,3,8,0,14,6,5,13,10,15]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1024,512,2048,4096,256,16384,8192,512]
+; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE41-NEXT:    paddb %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT:    psraw $8, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
-; SSE41-NEXT:    pmullw %xmm3, %xmm0
-; SSE41-NEXT:    psrlw $8, %xmm0
-; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT:    psraw $8, %xmm2
-; SSE41-NEXT:    pmullw %xmm3, %xmm2
-; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    packuswb %xmm0, %xmm2
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [16384,32768,8192,4096,256,1024,2048,32768]
+; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [32,32,64,64,16,16,8,8,u,u,2,2,4,4,64,64]
+; SSE41-NEXT:    pxor %xmm1, %xmm2
+; SSE41-NEXT:    psubb %xmm1, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = zero,xmm2[1,2,7,4,12,11,3],zero,xmm2[0,14,6,5,13,10,15]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
@@ -2184,39 +2172,23 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ; SSE41-LABEL: non_splat_minus_one_divisor_1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pcmpgtb %xmm1, %xmm3
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    psllw $1, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7]
-; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128]
-; SSE41-NEXT:    psrlw $8, %xmm3
-; SSE41-NEXT:    packuswb %xmm3, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,6,4,5,3,7,12,9,10,11,15,13,14,8]
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,512,256,256,512,512,32768,512]
+; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE41-NEXT:    paddb %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT:    psraw $8, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    psllw $1, %xmm3
-; SSE41-NEXT:    psllw $7, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
-; SSE41-NEXT:    psrlw $8, %xmm0
-; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT:    psraw $8, %xmm2
-; SSE41-NEXT:    psllw $7, %xmm2
-; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    packuswb %xmm0, %xmm2
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
-; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
-; SSE41-NEXT:    pxor %xmm0, %xmm1
-; SSE41-NEXT:    psubb %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,32768,256,256,32768,32768,512,32768]
+; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [u,u,64,64,u,u,u,u,64,64,64,64,1,1,64,u]
+; SSE41-NEXT:    pxor %xmm1, %xmm2
+; SSE41-NEXT:    psubb %xmm1, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,xmm2[9,10,11,8,13,14,12]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3,4,5],zero,xmm0[7,8],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    psubb %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: non_splat_minus_one_divisor_1:
@@ -2253,25 +2225,23 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ;
 ; AVX2-LABEL: non_splat_minus_one_divisor_1:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,2,256,256,256,2,256,256,2,2,2,2,128,2,128]
-; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,128,256,256,256,128,256,256,128,128,128,128,2,128,2]
-; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[14,8,2,6,4,5,3,7,12,9,10,11,15,13,0,1]
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [64,64,64,64,1,1,0,0,64,64,64,64,1,1,0,0]
+; AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,xmm1[9,10,11,8,13,0,12]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3,4,5],zero,xmm0[7,8],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: non_splat_minus_one_divisor_1:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 7903781d63523a..b82587a06b580e 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -2010,13 +2010,11 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; SSE41-NEXT:    psrlw $8, %xmm3
 ; SSE41-NEXT:    packuswb %xmm1, %xmm3
 ; SSE41-NEXT:    paddb %xmm0, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
-; SSE41-NEXT:    psllw $8, %xmm1
-; SSE41-NEXT:    por %xmm3, %xmm1
-; SSE41-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; SSE41-NEXT:    pshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,8,13,3,11,5,9,7,10,6,12,4,14,2,1,15]
+; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,16,4,1,2,8,32,64]
+; SSE41-NEXT:    pshufb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,14,13,3,11,5,9,7,1,6,8,4,10,2,12,15]
 ; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    por %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: constant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 19bbf7dc0a0e1a..dbf38ec73c6ee7 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -262,22 +262,20 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
 ;
 ; X86-SSE4-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
 ; X86-SSE4:       # %bb.0:
-; X86-SSE4-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8]
-; X86-SSE4-NEXT:    psllw $8, %xmm1
-; X86-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0]
+; X86-SSE4-NEXT:    movdqa {{.*#+}} xmm1 = [0,4,2,6,1,5,3,7,8,12,10,14,9,13,11,15]
+; X86-SSE4-NEXT:    pshufb %xmm1, %xmm0
+; X86-SSE4-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,4,2,8,1,4,2,8]
+; X86-SSE4-NEXT:    pshufb %xmm1, %xmm0
 ; X86-SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE4-NEXT:    por %xmm1, %xmm0
 ; X86-SSE4-NEXT:    retl
 ;
 ; X64-SSE4-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
 ; X64-SSE4:       # %bb.0:
-; X64-SSE4-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8]
-; X64-SSE4-NEXT:    psllw $8, %xmm1
-; X64-SSE4-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0]
+; X64-SSE4-NEXT:    movdqa {{.*#+}} xmm1 = [0,4,2,6,1,5,3,7,8,12,10,14,9,13,11,15]
+; X64-SSE4-NEXT:    pshufb %xmm1, %xmm0
+; X64-SSE4-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4,2,8,1,4,2,8]
+; X64-SSE4-NEXT:    pshufb %xmm1, %xmm0
 ; X64-SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE4-NEXT:    por %xmm1, %xmm0
 ; X64-SSE4-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
@@ -287,12 +285,11 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
 ;
 ; X64-AVX2-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; X64-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8]
-; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15]
+; X64-AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64-AVX2-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index 2ec9de0cb447f5..b58ab80f475ed3 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -2042,12 +2042,13 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ;
 ; AVX2-LABEL: constant_shift_v4i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256]
-; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,14,8,9,1,15,6,7,2,12,10,11,3,13,4,5]
+; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,14,15,6,7,2,3,10,11,9,13,1,5]
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = [128,64,32,16,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: constant_shift_v4i8:
@@ -2105,17 +2106,29 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 }
 
 define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
-; SSE-LABEL: constant_shift_v2i8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT:    psraw $8, %xmm0
-; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256]
-; SSE-NEXT:    psrlw $8, %xmm0
-; SSE-NEXT:    packuswb %xmm2, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: constant_shift_v2i8:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256]
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: constant_shift_v2i8:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,14,2,3,4,5,6,7,8,9,10,11,12,13,1,15]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,256,256,256,256,256,256,8192]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    movd {{.*#+}} xmm1 = [32,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    psubb %xmm1, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: constant_shift_v2i8:
 ; AVX1:       # %bb.0:
@@ -2130,12 +2143,14 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ;
 ; AVX2-LABEL: constant_shift_v2i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
-; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,14,2,3,4,5,6,7,8,9,10,11,12,13,1,15]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [32,16,16,0,32,0,16,0,32,0,16,0,32,0,0,0]
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: constant_shift_v2i8:
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index fe349e9ff995d9..f6291ea4ae45c9 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1744,12 +1744,10 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ;
 ; AVX2-LABEL: constant_shift_v4i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256]
-; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,14,8,9,1,15,6,7,2,12,10,11,3,13,4,5]
+; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,14,15,6,7,2,3,10,11,9,13,1,5]
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: constant_shift_v4i8:
@@ -1819,13 +1817,11 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ;
 ; SSE41-LABEL: constant_shift_v2i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,256,256,256,256,256,256]
-; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    packuswb %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,14,2,3,4,5,6,7,8,9,10,11,12,13,1,15]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,256,256,256,256,256,256,8192]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: constant_shift_v2i8:
@@ -1840,12 +1836,11 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ;
 ; AVX2-LABEL: constant_shift_v2i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
-; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,14,2,3,4,5,6,7,8,9,10,11,12,13,1,15]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: constant_shift_v2i8:
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 902bf8a0e55ce8..0e20d83d817596 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -1162,12 +1162,11 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ;
 ; SSE41-LABEL: constant_shift_v16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
-; SSE41-NEXT:    psllw $8, %xmm1
-; SSE41-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [14,1,12,3,10,5,8,7,6,9,4,11,2,13,0,15]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,8,32,128,64,16,4,1]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
 ; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: constant_shift_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
index a44120b6d038ce..21b6f301d58c3c 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -1437,11 +1437,11 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
 ;
 ; SSE41-LABEL: constant_shift_v8i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,14,2,12,4,10,6,8,7,9,5,11,3,13,1,15]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4,16,64,128,32,8,2]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
 ; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    packuswb %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: constant_shift_v8i8:
@@ -1526,11 +1526,11 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ;
 ; SSE41-LABEL: constant_shift_v4i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,u,u,u,u]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,14,2,12,4,5,6,7,8,9,10,11,3,13,1,15]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4,256,256,256,256,8,2]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
 ; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    packuswb %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: constant_shift_v4i8:
@@ -1544,12 +1544,10 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
 ;
 ; AVX2-LABEL: constant_shift_v4i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,14,8,9,1,15,6,7,2,12,10,11,3,13,4,5]
+; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,14,15,6,7,2,3,10,11,9,13,1,5]
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: constant_shift_v4i8:
@@ -1615,11 +1613,11 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ;
 ; SSE41-LABEL: constant_shift_v2i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,8,u,u,u,u,u,u]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,14,2,3,4,5,6,7,8,9,10,11,12,13,1,15]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,256,256,256,256,256,256,8]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
 ; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    packuswb %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: constant_shift_v2i8:
@@ -1633,12 +1631,11 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
 ;
 ; AVX2-LABEL: constant_shift_v2i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [4,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,14,2,3,4,5,6,7,8,9,10,11,12,13,1,15]
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: constant_shift_v2i8:
diff --git a/llvm/test/CodeGen/X86/vector-shift-widen.ll b/llvm/test/CodeGen/X86/vector-shift-widen.ll
index 556f34719b6c48..20af073a152d1a 100644
--- a/llvm/test/CodeGen/X86/vector-shift-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-widen.ll
@@ -14,7 +14,7 @@ define <16 x i8> @shl_v16i8(<16 x i8> %a) {
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [8,1,2,12,4,5,6,7,0,9,10,11,3,13,14,15]
 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
-; SSSE3-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4,1,1,8,1,16,32]
+; SSSE3-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4,256,256,8,256,16,32]
 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
 ; SSSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSSE3-NEXT:    retq

>From b8a731d646daf3ce3c20f7ab996a06edd43d00d8 Mon Sep 17 00:00:00 2001
From: William Huang <williamjhuang at google.com>
Date: Mon, 16 Dec 2024 19:27:17 -0500
Subject: [PATCH 7/8] added safeguard to prevent the transformation being
 applied recursively

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 72307cbd99d680..0fe71e51507267 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30248,6 +30248,11 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       }
     }
 
+    // If the shuffle is the identity, do not insert it. This also prevents the
+    // transformation from being applied recursively.
+    if (llvm::equal(Permutation, llvm::seq(Permutation.size())))
+      Profitable = false;
+
     // Found a permutation P that can rearrange the shift amouts into adjacent
     // pair or quad of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
     if (Profitable) {

>From efeb5f388c16b1135f06c829ea58bca9db15a126 Mon Sep 17 00:00:00 2001
From: William Huang <williamjhuang at google.com>
Date: Fri, 20 Dec 2024 00:06:36 -0500
Subject: [PATCH 8/8] Update cases to apply this transformation for AMD CPU
 after finding different latency on AMD Zen+, 2 and 3 CPU

---
 llvm/lib/Target/X86/X86ISelLowering.cpp     | 10 +++++++---
 llvm/test/CodeGen/X86/vector-shift-widen.ll | 16 ----------------
 2 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0fe71e51507267..5675d8c20e68c1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30237,9 +30237,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     // If operand R is not a shuffle, several cases are not profitable based on
     // pipeline modeling, so we are excluding them here.
     if (!IsOperandShuffle) {
-      // A hack to detect AMD CPU.
-      if (Subtarget.hasSSE4A() && Opc == ISD::SRA) {
-        if (Opc == ISD::SRA)
+      // A hack to detect AMD Zen series CPU.
+      if (Subtarget.hasSSE4A()) {
+        if (!IsAdjacentQuads)
+          Profitable = false;
+        // A hack to detect Zen+ and Zen 2 (no VAES): VPSRLVD is 2 cycles slower
+        // there than on Zen 3, so this transformation should not be used.
+        else if (!Subtarget.hasVAES())
           Profitable = false;
       } else {
         if ((Subtarget.hasAVX() && !Subtarget.hasAVX2()) ||
diff --git a/llvm/test/CodeGen/X86/vector-shift-widen.ll b/llvm/test/CodeGen/X86/vector-shift-widen.ll
index 20af073a152d1a..ec2441c04cfb92 100644
--- a/llvm/test/CodeGen/X86/vector-shift-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-widen.ll
@@ -173,14 +173,6 @@ define <32 x i8> @shl_v32i8_quad(<32 x i8> %a) {
 ; AVX2-NEXT:     vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
 ; AVX2-NEXT:     vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:     retq
-;
-; ZNVER1-LABEL: shl_v32i8_quad:
-; ZNVER1:      # %bb.0:
-; ZNVER1-NEXT:   vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
-; ZNVER1-NEXT:   vpsllvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; ZNVER1-NEXT:   vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
-; ZNVER1-NEXT:   vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; ZNVER1-NEXT:   retq
   %shift = shl <32 x i8> %a, <i8 0, i8 2, i8 4, i8 6, i8 4, i8 0, i8 6, i8 2, i8 2, i8 0, i8 4, i8 6, i8 6, i8 0, i8 4, i8 2,
                               i8 1, i8 3, i8 5, i8 7, i8 1, i8 3, i8 5, i8 7, i8 7, i8 5, i8 3, i8 1, i8 7, i8 5, i8 3, i8 1>
   ret <32 x i8> %shift
@@ -194,14 +186,6 @@ define <32 x i8> @lshr_v32i8_quad(<32 x i8> %a) {
 ; AVX2-NEXT:     vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
 ; AVX2-NEXT:     vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:     retq
-;
-; ZNVER1-LABEL: lshr_v32i8_quad:
-; ZNVER1:      # %bb.0:
-; ZNVER1-NEXT:   vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
-; ZNVER1-NEXT:   vpsrlvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; ZNVER1-NEXT:   vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
-; ZNVER1-NEXT:   vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; ZNVER1-NEXT:   retq
   %shift = lshr <32 x i8> %a, <i8 0, i8 2, i8 4, i8 6, i8 4, i8 0, i8 6, i8 2, i8 2, i8 0, i8 4, i8 6, i8 6, i8 0, i8 4, i8 2,
                                i8 1, i8 3, i8 5, i8 7, i8 1, i8 3, i8 5, i8 7, i8 7, i8 5, i8 3, i8 1, i8 7, i8 5, i8 3, i8 1>
   ret <32 x i8> %shift



More information about the llvm-commits mailing list