[llvm] Use shift+add/sub for vXi8 splat multiplies #164200 (PR #174110)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 31 12:48:22 PST 2025


llvmbot wrote:


@llvm/pr-subscribers-backend-x86

Author: Cody Cutler (grodranlorth)

Issue #164200

I will open a separate PR against the `llvm-test-suite` repo adding the microbenchmark for this change.

In my experiments on an EC2 `c6i.4xl` instance, the change gives a small improvement for the `x86-64`, `x86-64-v2`, and `x86-64-v3` targets, but regresses performance on `x86-64-v4` (in particular, when the constant decomposes into two shifts). The overall geomeans of the relative time deltas follow (negative values are improvements):

```
$ ../MicroBenchmarks/libs/benchmark/tools/compare.py  benchmarks results-baseline-generic-v1.json results-opt-generic-v1.json  |tail -n1
OVERALL_GEOMEAN                         -0.2846         -0.2846             0             0             0             0
$ ../MicroBenchmarks/libs/benchmark/tools/compare.py  benchmarks results-baseline-generic-v2.json results-opt-generic-v2.json  |tail -n1
OVERALL_GEOMEAN                         -0.0907         -0.0907             0             0             0             0
$ ../MicroBenchmarks/libs/benchmark/tools/compare.py  benchmarks results-baseline-generic-v3.json results-opt-generic-v3.json  |tail -n1
OVERALL_GEOMEAN                         -0.1821         -0.1821             0             0             0             0
$ ../MicroBenchmarks/libs/benchmark/tools/compare.py  benchmarks results-baseline-generic-v4.json results-opt-generic-v4.json  |tail -n1
OVERALL_GEOMEAN                         +0.0190         +0.0190             0             0             0             0
```
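
For context, the transform rewrites a vXi8 splat multiply into shift/add or shift/sub sequences when the absolute value of the constant is a power of two, a sum of two powers of two, or a difference of two powers of two. A scalar sketch of the two-shift shapes (my own illustration, not code from the patch):

```c++
#include <cstdint>

// 6 = 4 + 2, so x * 6 becomes (x << 2) + (x << 1).
uint8_t mul6(uint8_t x) { return static_cast<uint8_t>((x << 2) + (x << 1)); }

// 14 = 16 - 2, so x * 14 becomes (x << 4) - (x << 1).
uint8_t mul14(uint8_t x) { return static_cast<uint8_t>((x << 4) - (x << 1)); }
```

The backend emits the vector analogue (`psllw` plus a `pand` mask, then `paddb`/`psubb`), since x86 has no byte-granular vector shift by immediate; the masking is visible in the new tests' CHECK lines.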

---

Patch is 81.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/174110.diff


6 Files Affected:

- (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+17) 
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+24) 
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+59) 
- (modified) llvm/lib/Target/X86/X86ISelLowering.h (+2) 
- (added) llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll (+1231) 
- (added) llvm/test/CodeGen/X86/vector-mul-i8-negative.ll (+466) 


``````````diff
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8ad64a852b74d..7594b487b9666 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2538,6 +2538,23 @@ class LLVM_ABI TargetLoweringBase {
     return false;
   }
 
+  /// Structure to hold detailed decomposition of multiply by constant.
+  struct MulByConstInfo {
+    bool IsDecomposable = false;
+    bool Negate = false;    // True if result should be negated
+    unsigned NumShifts = 0; // 1 or 2
+    unsigned Shift1 = 0;    // Primary shift amount
+    unsigned Shift2 = 0;    // Secondary shift amount (for 2-shift case)
+    bool IsSub = false;     // True for SUB, false for ADD (for 2-shift case)
+  };
+
+  /// Get detailed decomposition of multiply by constant if available.
+  /// Returns decomposition info if the target has a custom decomposition
+  /// for this multiply-by-constant, otherwise returns IsDecomposable = false.
+  virtual MulByConstInfo getMulByConstInfo(EVT VT, const APInt &C) const {
+    return MulByConstInfo();
+  }
+
   /// Return true if it may be profitable to transform
   /// (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
   /// This may not be true if c1 and c2 can be represented as immediates but
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 74d00317c3649..26ff36a768167 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4836,6 +4836,30 @@ template <class MatchContextClass> SDValue DAGCombiner::visitMUL(SDNode *N) {
   //           x * -0xf800 --> -((x << 16) - (x << 11)) ; (x << 11) - (x << 16)
   if (!UseVP && N1IsConst &&
       TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
+    // First, check whether the target provides a custom decomposition.
+    TargetLowering::MulByConstInfo Info =
+        TLI.getMulByConstInfo(VT, ConstValue1);
+    if (Info.IsDecomposable) {
+      // Emit custom decomposition based on target's info
+      SDValue Result;
+      if (Info.NumShifts == 1) {
+        // Single shift: result = N0 << Shift1
+        Result = DAG.getNode(ISD::SHL, DL, VT, N0,
+                             DAG.getConstant(Info.Shift1, DL, VT));
+      } else if (Info.NumShifts == 2) {
+        // Two shifts with add or sub
+        SDValue Shl1 = DAG.getNode(ISD::SHL, DL, VT, N0,
+                                   DAG.getConstant(Info.Shift1, DL, VT));
+        SDValue Shl2 = DAG.getNode(ISD::SHL, DL, VT, N0,
+                                   DAG.getConstant(Info.Shift2, DL, VT));
+        Result =
+            DAG.getNode(Info.IsSub ? ISD::SUB : ISD::ADD, DL, VT, Shl1, Shl2);
+      }
+      if (Info.Negate)
+        Result = DAG.getNegative(Result, DL, VT);
+      return Result;
+    }
+
     // TODO: We could handle more general decomposition of any constant by
     //       having the target set a limit on number of ops and making a
     //       callback to determine that sequence (similar to sqrt expansion).
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 20136ade7c317..f75ce5a53188f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3389,6 +3389,12 @@ bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
   if (!ISD::isConstantSplatVector(C.getNode(), MulC))
     return false;
 
+  // Check if this is an 8-bit vector multiply that can be decomposed to shifts.
+  if (VT.isVector() && VT.getScalarSizeInBits() == 8) {
+    if (getMulByConstInfo(VT, MulC).IsDecomposable)
+      return true;
+  }
+
   // Find the type this will be legalized too. Otherwise we might prematurely
   // convert this to shl+add/sub and then still have to type legalize those ops.
   // Another choice would be to defer the decision for illegal types until
@@ -3413,6 +3419,59 @@ bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
          (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
 }
 
+TargetLowering::MulByConstInfo
+X86TargetLowering::getMulByConstInfo(EVT VT, const APInt &Constant) const {
+  // Only handle 8-bit vector multiplies
+  if (!VT.isVector() || VT.getScalarSizeInBits() != 8 ||
+      Constant.getBitWidth() < 8)
+    return MulByConstInfo();
+
+  TargetLowering::MulByConstInfo Info;
+  int8_t SignedC = static_cast<int8_t>(Constant.getZExtValue());
+  Info.Negate = SignedC < 0;
+
+  uint32_t U = static_cast<uint8_t>(Info.Negate ? -SignedC : SignedC);
+  if (U == 0 || U == 1)
+    return Info;
+
+  // Power of 2.
+  if (isPowerOf2_32(U)) {
+    Info.Shift1 = llvm::countr_zero(U);
+    Info.NumShifts = 1;
+    Info.IsDecomposable = true;
+    return Info;
+  }
+
+  // Decomposition logic:
+  //   m = 2^a + 2^b  => (shl x, a) + (shl x, b)
+  //   m = 2^a - 2^b  => (shl x, a) - (shl x, b)
+  // where 2^b is the lowest set bit of m.
+  uint32_t LowBit = U & (0U - U);
+  unsigned Shift2 = llvm::countr_zero(LowBit);
+
+  uint32_t Rem = U - LowBit;
+  if (isPowerOf2_32(Rem)) {
+    Info.Shift1 = llvm::countr_zero(Rem);
+    Info.Shift2 = Shift2;
+    Info.IsSub = false;
+    Info.NumShifts = 2;
+    Info.IsDecomposable = true;
+    return Info;
+  }
+
+  uint32_t Sum = U + LowBit;
+  if (Sum <= 0xFF && isPowerOf2_32(Sum)) {
+    Info.Shift1 = llvm::countr_zero(Sum);
+    Info.Shift2 = Shift2;
+    Info.IsSub = true;
+    Info.NumShifts = 2;
+    Info.IsDecomposable = true;
+    return Info;
+  }
+
+  return Info;
+}
+
 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                 unsigned Index) const {
   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index a528c311975d8..24372598aaf53 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1537,6 +1537,8 @@ namespace llvm {
     bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
                                 SDValue C) const override;
 
+    MulByConstInfo getMulByConstInfo(EVT VT, const APInt &C) const override;
+
     /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
     /// with this index.
     bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
diff --git a/llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll b/llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll
new file mode 100644
index 0000000000000..9648352ebc2a9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll
@@ -0,0 +1,1231 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-tuning-fast-imm-vector-shift | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-tuning-fast-imm-vector-shift,+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-tuning-fast-imm-vector-shift,+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512
+
+;; Tests vXi8 constant-multiply decomposition into shift/add/sub sequences.
+;;
+;; Examples:
+;;   6 = 2^2 + 2^1 = 4 + 2  (or 8 - 2)
+;;   10 = 2^3 + 2^1 = 8 + 2
+;;   12 = 2^3 + 2^2 = 8 + 4  (or 16 - 4)
+;;   18 = 2^4 + 2^1 = 16 + 2
+;;   20 = 2^4 + 2^2 = 16 + 4
+;;   24 = 2^4 + 2^3 = 16 + 8  (or 32 - 8)
+;;
+;; To run this test:
+;;   llvm-lit llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll
+;;
+;; To regenerate CHECK lines:
+;;   python llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll
+
+;; ============================================================================
+;; v16i8 Tests (128-bit vectors) - Sum of two powers of 2
+;; ============================================================================
+
+define <16 x i8> @mul_v16i8_const6(<16 x i8> %a) nounwind {
+; Test multiply by 6 = 4 + 2 = (1 << 2) + (1 << 1)
+; SSE2-LABEL: mul_v16i8_const6:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    psllw $2, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: mul_v16i8_const6:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
+; AVX2-NEXT:    vpsllw $2, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: mul_v16i8_const6:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
+; AVX512-NEXT:    vpsllw $2, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %result = mul <16 x i8> %a, <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>
+  ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const10(<16 x i8> %a) nounwind {
+; Test multiply by 10 = 8 + 2 = (1 << 3) + (1 << 1)
+; SSE2-LABEL: mul_v16i8_const10:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    psllw $3, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: mul_v16i8_const10:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
+; AVX2-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: mul_v16i8_const10:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
+; AVX512-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %result = mul <16 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
+  ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const12(<16 x i8> %a) nounwind {
+; Test multiply by 12 = 8 + 4 = (1 << 3) + (1 << 2)
+; SSE2-LABEL: mul_v16i8_const12:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $2, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psllw $3, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: mul_v16i8_const12:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $2, %xmm0, %xmm1
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: mul_v16i8_const12:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $2, %xmm0, %xmm1
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %result = mul <16 x i8> %a, <i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12>
+  ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const18(<16 x i8> %a) nounwind {
+; Test multiply by 18 = 16 + 2 = (1 << 4) + (1 << 1)
+; SSE2-LABEL: mul_v16i8_const18:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    psllw $4, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: mul_v16i8_const18:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
+; AVX2-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: mul_v16i8_const18:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
+; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %result = mul <16 x i8> %a, <i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18>
+  ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const20(<16 x i8> %a) nounwind {
+; Test multiply by 20 = 16 + 4 = (1 << 4) + (1 << 2)
+; SSE2-LABEL: mul_v16i8_const20:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $2, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psllw $4, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: mul_v16i8_const20:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $2, %xmm0, %xmm1
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: mul_v16i8_const20:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $2, %xmm0, %xmm1
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %result = mul <16 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20>
+  ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const24(<16 x i8> %a) nounwind {
+; Test multiply by 24 = 16 + 8 = (1 << 4) + (1 << 3)
+; SSE2-LABEL: mul_v16i8_const24:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $3, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psllw $4, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: mul_v16i8_const24:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $3, %xmm0, %xmm1
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: mul_v16i8_const24:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $3, %xmm0, %xmm1
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %result = mul <16 x i8> %a, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+  ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const34(<16 x i8> %a) nounwind {
+; Test multiply by 34 = 32 + 2 = (1 << 5) + (1 << 1)
+; SSE2-LABEL: mul_v16i8_const34:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    psllw $5, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: mul_v16i8_const34:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
+; AVX2-NEXT:    vpsllw $5, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: mul_v16i8_const34:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
+; AVX512-NEXT:    vpsllw $5, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %result = mul <16 x i8> %a, <i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34>
+  ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const36(<16 x i8> %a) nounwind {
+; Test multiply by 36 = 32 + 4 = (1 << 5) + (1 << 2)
+; SSE2-LABEL: mul_v16i8_const36:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $2, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psllw $5, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: mul_v16i8_const36:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $2, %xmm0, %xmm1
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsllw $5, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: mul_v16i8_const36:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $2, %xmm0, %xmm1
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllw $5, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %result = mul <16 x i8> %a, <i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36>
+  ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const40(<16 x i8> %a) nounwind {
+; Test multiply by 40 = 32 + 8 = (1 << 5) + (1 << 3)
+; SSE2-LABEL: mul_v16i8_const40:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $3, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psllw $5, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: mul_v16i8_const40:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $3, %xmm0, %xmm1
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsllw $5, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: mul_v16i8_const40:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $3, %xmm0, %xmm1
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllw $5, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %result = mul <16 x i8> %a, <i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40>
+  ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const48(<16 x i8> %a) nounwind {
+; Test multiply by 48 = 32 + 16 = (1 << 5) + (1 << 4)
+; SSE2-LABEL: mul_v16i8_const48:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $4, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psllw $5, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: mul_v16i8_const48:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllw $4, %xmm0, %xmm1
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsllw $5, %xmm0, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: mul_v16i8_const48:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm1
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllw $5, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+  %result = mul <16 x i8> %a, <i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48>
+  ret <16 x i...
[truncated]

``````````
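
For readers who want to sanity-check the decomposition rules without building LLVM, here is a self-contained C++20 sketch of the same selection logic (the standalone struct, `decomposeI8`, and the test harness are mine, not part of the patch):

```c++
#include <bit>
#include <cassert>
#include <cstdint>

struct MulByConstInfo {
  bool IsDecomposable = false;
  bool Negate = false;    // Negate the result for negative constants.
  unsigned NumShifts = 0; // 1 or 2 when decomposable.
  unsigned Shift1 = 0;    // Primary shift amount.
  unsigned Shift2 = 0;    // Secondary shift amount (two-shift case).
  bool IsSub = false;     // Two-shift case: SUB instead of ADD.
};

// Mirrors the patch's per-element logic for an i8 splat constant.
MulByConstInfo decomposeI8(int8_t C) {
  MulByConstInfo Info;
  Info.Negate = C < 0;
  uint32_t U = static_cast<uint8_t>(Info.Negate ? -C : C);
  if (U == 0 || U == 1)
    return Info; // 0 and +/-1 need no shifts at all.

  if (std::has_single_bit(U)) { // Power of 2: a single shift.
    Info.Shift1 = std::countr_zero(U);
    Info.NumShifts = 1;
    Info.IsDecomposable = true;
    return Info;
  }

  uint32_t LowBit = U & (0U - U); // Isolate the lowest set bit, 2^b.
  unsigned Shift2 = std::countr_zero(LowBit);

  if (uint32_t Rem = U - LowBit; std::has_single_bit(Rem)) {
    // U = 2^a + 2^b  =>  (x << a) + (x << b)
    Info.Shift1 = std::countr_zero(Rem);
    Info.Shift2 = Shift2;
    Info.IsSub = false;
    Info.NumShifts = 2;
    Info.IsDecomposable = true;
    return Info;
  }

  if (uint32_t Sum = U + LowBit; Sum <= 0xFF && std::has_single_bit(Sum)) {
    // U = 2^a - 2^b  =>  (x << a) - (x << b)
    Info.Shift1 = std::countr_zero(Sum);
    Info.Shift2 = Shift2;
    Info.IsSub = true;
    Info.NumShifts = 2;
    Info.IsDecomposable = true;
    return Info;
  }

  return Info; // e.g. 42 = 32 + 8 + 2 has three set bits: not decomposable.
}

int main() {
  MulByConstInfo I = decomposeI8(6); // 6 = 4 + 2
  assert(I.IsDecomposable && I.NumShifts == 2 && !I.IsSub &&
         I.Shift1 == 2 && I.Shift2 == 1);
  I = decomposeI8(14); // 14 = 16 - 2
  assert(I.IsDecomposable && I.IsSub && I.Shift1 == 4 && I.Shift2 == 1);
  I = decomposeI8(-64); // -64 = -(1 << 6)
  assert(I.IsDecomposable && I.Negate && I.NumShifts == 1 && I.Shift1 == 6);
  assert(!decomposeI8(42).IsDecomposable); // Three set bits.
}
```

Note that the add form wins when both apply (e.g. 6 is emitted as 4 + 2 rather than 8 - 2), matching the order of the checks in the patch.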


https://github.com/llvm/llvm-project/pull/174110


More information about the llvm-commits mailing list