[llvm] [X86][Codegen] Shuffle certain shifts on i8 vectors to create opportunity for vectorized shift instructions (PR #117980)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Dec 14 21:51:09 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: William Huang (huangjd)
<details>
<summary>Changes</summary>
x86 has no vectorized shift instructions for the i8 element type. The typical lowering of a shift on an i8 vector uses two vector i16 multiplies to handle the even and odd bytes separately and then combines the results. If the shift amount is a constant vector that can be shuffled so that each pair (or quad) of adjacent elements holds the same value, we can instead perform the shift on a widened type and then use a vector AND to clear the bits that should have been shifted out of each byte. This is typically faster than the vector multiplies, provided the shuffles themselves are cheap, since the operand has to be shuffled before the shift and shuffled back to its original order afterwards.
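To illustrate the arithmetic this relies on, here is a minimal scalar C++ model (not part of the patch; `shlPackedBytes` is a made-up name) of shifting two i8 lanes that share the same amount as one widened i16 lane, followed by the AND that clears the bits crossing the byte boundary:

```cpp
#include <cassert>
#include <cstdint>

// Shift two packed bytes (lo in bits 0..7, hi in bits 8..15) left by the same
// amount S using one widened 16-bit shift, then AND with a per-byte mask that
// clears the low S bits of each byte, i.e. the bits that crossed over from the
// neighboring byte.
static uint16_t shlPackedBytes(uint16_t Packed, unsigned S) {
  uint16_t Shifted = static_cast<uint16_t>(Packed << S);
  uint8_t ByteMask = static_cast<uint8_t>(0xFFu << S);
  uint16_t Mask = static_cast<uint16_t>(ByteMask << 8 | ByteMask);
  return Shifted & Mask;
}

int main() {
  uint8_t Lo = 0xAB, Hi = 0xCD;
  unsigned S = 3;
  uint16_t Packed = static_cast<uint16_t>(Hi) << 8 | Lo;
  uint16_t R = shlPackedBytes(Packed, S);
  // Each byte matches an independent i8 shift.
  assert(static_cast<uint8_t>(R) == static_cast<uint8_t>(Lo << S));
  assert(static_cast<uint8_t>(R >> 8) == static_cast<uint8_t>(Hi << S));
  return 0;
}
```

For a logical right shift the analogous per-byte mask is `0xFF >> S`.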
---
Patch is 52.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117980.diff
9 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+237-1)
- (modified) llvm/test/CodeGen/X86/combine-sdiv.ll (+43-73)
- (modified) llvm/test/CodeGen/X86/vector-fshr-128.ll (+4-6)
- (modified) llvm/test/CodeGen/X86/vector-mul.ll (+13-16)
- (modified) llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll (+38-23)
- (modified) llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll (+14-19)
- (modified) llvm/test/CodeGen/X86/vector-shift-shl-128.ll (+4-5)
- (modified) llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll (+21-24)
- (added) llvm/test/CodeGen/X86/vector-shift-widen.ll (+306)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1c790f3813b7a4..5444d9a91da99c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28,7 +28,6 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ObjCARCUtil.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -29766,6 +29765,113 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
return SDValue();
}
+// Given a vector of values, find a permutation such that every adjacent even-
+// odd pair has the same value. ~0 is reserved as a special value for wildcard,
+// which can be paired with any value. Returns true if a permutation is found.
+// If the output Permutation is not empty, permutation indices start at its
+// previous size, so that the results of multiple calls can be concatenated.
+// UnpairedInputs holds the values yet to be paired, mapping each unpaired
+// value to its current neighbor's value and index.
+// Do not use llvm::DenseMap, since ~0 is a reserved key for it.
+template <typename InputTy, typename PermutationTy,
+ typename MapTy =
+ SmallMapVector<typename InputTy::value_type,
+ std::pair<typename InputTy::value_type,
+ typename PermutationTy::value_type>,
+ 8>>
+static bool PermuteAndPairVector(
+ const InputTy &Inputs, PermutationTy &Permutation,
+ MapTy UnpairedInputs = MapTy()) {
+ static_assert(std::is_same<typename InputTy::value_type, uint8_t>::value);
+ const typename InputTy::value_type Wildcard = ~0;
+ SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
+
+ size_t OutputOffset = Permutation.size();
+ typename PermutationTy::value_type I = 0;
+ for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end();
+ InputIt != InputEnd;) {
+ Permutation.push_back(OutputOffset + I);
+ Permutation.push_back(OutputOffset + I + 1);
+
+ auto Even = *InputIt++;
+ assert(InputIt != InputEnd && "Expected even number of elements");
+ auto Odd = *InputIt++;
+
+ // If both are wildcards, note it for later use by unpairable values.
+ if (Even == Wildcard && Odd == Wildcard) {
+ WildcardPairs.push_back(I);
+ }
+
+ // If both are equal, they are in good position.
+ if (Even != Odd) {
+ auto DoWork = [&](auto &This, auto ThisIndex, auto Other,
+ auto OtherIndex) {
+ if (This != Wildcard) {
+ // For a non-wildcard value, check if it can pair with an existing
+ // unpaired value from UnpairedInputs; if so, swap with the unpaired
+ // value's neighbor, otherwise add the current value to the map.
+ if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(
+ This, std::make_pair(Other, OtherIndex));
+ !Inserted) {
+ auto [SwapValue, SwapIndex] = MapIt->second;
+ std::swap(Permutation[OutputOffset + SwapIndex],
+ Permutation[OutputOffset + ThisIndex]);
+ This = SwapValue;
+ UnpairedInputs.erase(MapIt);
+
+ if (This == Other) {
+ if (This == Wildcard) {
+ // We freed up a wildcard pair by pairing two non-adjacent
+ // values, note it for later use by unpairable values.
+ WildcardPairs.push_back(I);
+ } else {
+ // The swapped element also forms a pair with Other, so it can
+ // be removed from the map.
+ assert(UnpairedInputs.count(This));
+ UnpairedInputs.erase(This);
+ }
+ } else {
+ // Swapped in an unpaired value, update its info.
+ if (This != Wildcard) {
+ assert(UnpairedInputs.count(This));
+ UnpairedInputs[This] = std::make_pair(Other, OtherIndex);
+ }
+ // If its neighbor is also in UnpairedInputs, update its info too.
+ if (auto OtherMapIt = UnpairedInputs.find(Other);
+ OtherMapIt != UnpairedInputs.end() &&
+ OtherMapIt->second.second == ThisIndex) {
+ OtherMapIt->second.first = This;
+ }
+ }
+ }
+ }
+ };
+ DoWork(Even, I, Odd, I + 1);
+ if (Even != Odd) {
+ DoWork(Odd, I + 1, Even, I);
+ }
+ }
+ I += 2;
+ }
+
+ // Now check if each remaining unpaired value and its unpaired neighbor can
+ // be split across a wildcard pair so that both of them become paired.
+ for (auto &[Unpaired, V] : UnpairedInputs) {
+ auto [Neighbor, NeighborIndex] = V;
+ if (Neighbor != Wildcard) {
+ assert(UnpairedInputs.count(Neighbor));
+ if (WildcardPairs.size()) {
+ std::swap(Permutation[OutputOffset + WildcardPairs.back()],
+ Permutation[OutputOffset + NeighborIndex]);
+ WildcardPairs.pop_back();
+ // Mark the neighbor as processed.
+ UnpairedInputs[Neighbor].first = Wildcard;
+ } else
+ return false;
+ }
+ }
+ return true;
+}
+
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
@@ -30044,6 +30150,136 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
}
}
+ // SHL/SRL/SRA on vXi8 can be widened to vYi16 or vYi32 if the constant
+ // amounts can be shuffled such that every pair or quad of adjacent elements
+ // has the same value. This introduces an extra shuffle before and after the
+ // shift, and it is profitable if the operand is already a shuffle so that
+ // both can be merged, or if the extra shuffle is fast.
+ // (shift (shuffle X P1) S1) ->
+ // (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
+ // widened, and P2^-1 is the inverse shuffle of P2.
+ // This is not profitable on XOP or AVX512 because those targets have 8/16-bit
+ // vector variable shift instructions.
+ // GFNI is singled out because it normally implies AVX512, and there is no
+ // latency data for CPUs with GFNI but only SSE or AVX; such combinations are
+ // nevertheless covered by tests.
+ if (ConstantAmt &&
+ (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) &&
+ R.hasOneUse() && Subtarget.hasSSSE3() && !Subtarget.hasAVX512() &&
+ !Subtarget.hasXOP() && !Subtarget.hasGFNI()) {
+ constexpr size_t LaneBytes = 16;
+ const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
+
+ SmallVector<int, 64> Permutation;
+ SmallVector<uint8_t, 64> ShiftAmt;
+ for (size_t I = 0; I < Amt.getNumOperands(); ++I) {
+ if (Amt.getOperand(I).isUndef())
+ ShiftAmt.push_back(~0);
+ else {
+ auto A = Amt.getConstantOperandVal(I);
+ ShiftAmt.push_back(A > 8 ? 8 : A);
+ }
+ }
+
+ // Check if we can find an in-lane shuffle to rearrange the shift amounts;
+ // if so, this transformation may be profitable. A cross-lane shuffle is
+ // almost never profitable because there is no general one-instruction
+ // solution.
+ bool Profitable;
+ for (size_t I = 0; I < NumLanes; ++I) {
+ if (!(Profitable = PermuteAndPairVector(
+ ArrayRef(&ShiftAmt[I * LaneBytes], LaneBytes), Permutation)))
+ break;
+ }
+
+ // For AVX2, check if we can further rearrange the shift amounts into adjacent
+ // quads, so that VPS*LVD can be used instead of VPMUL*W, which is 2 cycles
+ // faster.
+ bool IsAdjacentQuads = false;
+ if (Profitable && Subtarget.hasAVX2()) {
+ SmallVector<uint8_t, 64> EveryOtherShiftAmt;
+ for (size_t I = 0; I < Permutation.size(); I += 2) {
+ uint8_t Shift1 = ShiftAmt[Permutation[I]];
+ uint8_t Shift2 = ShiftAmt[Permutation[I + 1]];
+ assert(Shift1 == Shift2 || Shift1 == (uint8_t) ~0 ||
+ Shift2 == (uint8_t) ~0);
+ EveryOtherShiftAmt.push_back(Shift1 != (uint8_t) ~0 ? Shift1 : Shift2);
+ }
+ SmallVector<int, 32> Permutation2;
+ for (size_t I = 0; I < NumLanes; ++I) {
+ if (!(IsAdjacentQuads = PermuteAndPairVector(
+ ArrayRef(&EveryOtherShiftAmt[I * LaneBytes / 2],
+ LaneBytes / 2),
+ Permutation2)))
+ break;
+ }
+ if (IsAdjacentQuads) {
+ SmallVector<int, 64> CombinedPermutation;
+ for (int Index : Permutation2) {
+ CombinedPermutation.push_back(Permutation[Index * 2]);
+ CombinedPermutation.push_back(Permutation[Index * 2 + 1]);
+ }
+ std::swap(Permutation, CombinedPermutation);
+ }
+ }
+
+ // For right shifts, (V)PMULHUW needs 2 extra instructions to handle an
+ // amount of 0, making it unprofitable.
+ if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
+ any_of(ShiftAmt, [](uint8_t x) { return x == 0; }))
+ Profitable = false;
+
+ bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
+ // If operand R is a shuffle, one of the two shuffles introduced by this
+ // transformation can be merged with it, and the extra shuffle costs 1 cycle.
+ // This is generally profitable because it eliminates one (or both) of the
+ // vector multiplies, which have to be scheduled at least 1 cycle apart.
+ // If operand R is not a shuffle, several cases are not profitable based on
+ // pipeline modeling, so we exclude them here.
+ if (!IsOperandShuffle) {
+ // A hack to detect AMD CPUs (SSE4A is an AMD-specific extension).
+ if (Subtarget.hasSSE4A() && Opc == ISD::SRA) {
+ Profitable = false;
+ } else {
+ if ((Subtarget.hasAVX() && !Subtarget.hasAVX2()) ||
+ (Subtarget.hasAVX2() && !IsAdjacentQuads))
+ Profitable = false;
+ }
+ }
+
+ // Found a permutation P that rearranges the shift amounts into adjacent
+ // pairs or quads of equal values. Rewrite the shift S1(x) as P^-1(S2(P(x))).
+ if (Profitable) {
+ SDValue InnerShuffle =
+ DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
+ SmallVector<SDValue, 64> NewShiftAmt;
+ for (int Index : Permutation) {
+ NewShiftAmt.push_back(Amt.getOperand(Index));
+ }
+ // If using (V)PMULHUW, resolve any all-undef pair to a shift by 8 so that
+ // it does not create extra instructions if it were otherwise resolved to 0.
+ for (size_t I = 0; I < NewShiftAmt.size(); I += 2) {
+ SDValue &Even = NewShiftAmt[I];
+ SDValue &Odd = NewShiftAmt[I + 1];
+ assert(Even.isUndef() || Odd.isUndef() ||
+ Even->getAsZExtVal() == Odd->getAsZExtVal());
+ if (!IsAdjacentQuads && Even.isUndef() && Odd.isUndef())
+ Even = DAG.getConstant(8, dl, VT.getScalarType());
+ }
+
+ SDValue NewShiftVector = DAG.getBuildVector(VT, dl, NewShiftAmt);
+ SDValue NewShift = DAG.getNode(Opc, dl, VT, InnerShuffle, NewShiftVector);
+ SmallVector<int, 64> InversePermutation(Permutation.size());
+ for (size_t I = 0; I < Permutation.size(); ++I) {
+ InversePermutation[Permutation[I]] = I;
+ }
+ SDValue OuterShuffle = DAG.getVectorShuffle(
+ VT, dl, NewShift, DAG.getUNDEF(VT), InversePermutation);
+ return OuterShuffle;
+ }
+ }
+
// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
// For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 2b392e69297f07..b14c839a6f1f11 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -351,32 +351,20 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [256,4,2,16,8,32,64,2]
-; SSE41-NEXT: pmullw %xmm0, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pmullw %xmm0, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[9,1,2,7,4,12,11,3,8,0,14,6,5,13,10,15]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1024,512,2048,4096,256,16384,8192,512]
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: paddb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
-; SSE41-NEXT: pmullw %xmm3, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: psraw $8, %xmm2
-; SSE41-NEXT: pmullw %xmm3, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: packuswb %xmm0, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [16384,32768,8192,4096,256,1024,2048,32768]
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [32,32,64,64,16,16,8,8,u,u,2,2,4,4,64,64]
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: psubb %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[1,2,7,4,12,11,3],zero,xmm2[0,14,6,5,13,10,15]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
@@ -2184,39 +2172,23 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE41-LABEL: non_splat_minus_one_divisor_1:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: psllw $1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7]
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128]
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,6,4,5,3,7,12,9,10,11,15,13,14,8]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,512,256,256,512,512,32768,512]
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: paddb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllw $1, %xmm3
-; SSE41-NEXT: psllw $7, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: psraw $8, %xmm2
-; SSE41-NEXT: psllw $7, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: packuswb %xmm0, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: psubb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,32768,256,256,32768,32768,512,32768]
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,u,64,64,u,u,u,u,64,64,64,64,1,1,64,u]
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: psubb %xmm1, %xmm2
+; SSE41-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,xmm2[9,10,11,8,13,14,12]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3,4,5],zero,xmm0[7,8],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: psubb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: non_splat_minus_one_divisor_1:
@@ -2253,25 +2225,23 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
;
; AVX2-LABEL: non_splat_minus_one_divisor_1:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,2,256,256,256,2,256,256,2,2,2,2,128,2,128]
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,128,256,256,256,128,256,256,128,128,128,128,2,128,2]
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,8,2,6,4,5,3,7,12,9,10,11,15,13,0,1]
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [64,64,64,64,1,1,0,0,64,64,64,64,1,1,0,0]
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,xmm1[9,10,11,8,13,0,12]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3,4,5],zero,xmm0[7,8],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsub...
[truncated]
``````````
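As an aside for readers skimming the truncated diff: the core question PermuteAndPairVector answers for each 16-byte lane is whether a permutation exists that puts equal shift amounts (with undef acting as a wildcard) into adjacent even/odd pairs. A simplified, self-contained model of just that feasibility decision is sketched below (`canPairByPermutation` is a hypothetical name; the real function also constructs the permutation and supports concatenating results across lanes):

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

// Decide whether the shift amounts of one lane can be permuted so that every
// adjacent even/odd pair holds the same value. 0xFF stands in for ~0 (undef),
// which may pair with anything. With an even lane size, a pairing exists iff
// the wildcards can absorb every value that occurs an odd number of times.
static bool canPairByPermutation(const std::vector<uint8_t> &Amts) {
  const uint8_t Wildcard = 0xFF;
  std::map<uint8_t, unsigned> Count;
  unsigned Wildcards = 0;
  for (uint8_t A : Amts) {
    if (A == Wildcard)
      ++Wildcards;
    else
      ++Count[A];
  }
  unsigned OddLeftovers = 0;
  for (const auto &KV : Count)
    OddLeftovers += KV.second % 2;
  return Wildcards >= OddLeftovers;
}

int main() {
  // {1,2,3,4,4,3,2,1} can be permuted into {1,1,2,2,3,3,4,4}: pairable.
  std::vector<uint8_t> Ok = {1, 2, 3, 4, 4, 3, 2, 1};
  // Three values occur once each but only one wildcard exists: not pairable.
  std::vector<uint8_t> Bad = {1, 2, 3, 0xFF, 5, 5, 6, 6};
  std::cout << canPairByPermutation(Ok) << ' ' << canPairByPermutation(Bad)
            << '\n'; // prints "1 0"
  return 0;
}
```

Because each lane has an even number of elements, leftover wildcards can always pair with each other, which is why the sketch only compares the wildcard count against the number of odd-count values.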
</details>
https://github.com/llvm/llvm-project/pull/117980