[llvm] Use shift+add/sub for vXi8 splat multiplies #164200 (PR #174110)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 31 12:48:22 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Cody Cutler (grodranlorth)
<details>
<summary>Changes</summary>
Issue #164200
I will create a separate PR to the `llvm-test-suite` repo adding a microbenchmark for this change.
In my experiments on an EC2 `c6i.4xl` instance, the change gives a small improvement for the `x86-64`, `x86-64-v2`, and `x86-64-v3` targets, but regresses `x86-64-v4` (in particular, when the constant decomposes into two shifts). The performance summary follows; negative `OVERALL_GEOMEAN` values mean reduced runtime, i.e. an improvement:
```
$ ../MicroBenchmarks/libs/benchmark/tools/compare.py benchmarks results-baseline-generic-v1.json results-opt-generic-v1.json |tail -n1
OVERALL_GEOMEAN -0.2846 -0.2846 0 0 0 0
$ ../MicroBenchmarks/libs/benchmark/tools/compare.py benchmarks results-baseline-generic-v2.json results-opt-generic-v2.json |tail -n1
OVERALL_GEOMEAN -0.0907 -0.0907 0 0 0 0
$ ../MicroBenchmarks/libs/benchmark/tools/compare.py benchmarks results-baseline-generic-v3.json results-opt-generic-v3.json |tail -n1
OVERALL_GEOMEAN -0.1821 -0.1821 0 0 0 0
$ ../MicroBenchmarks/libs/benchmark/tools/compare.py benchmarks results-baseline-generic-v4.json results-opt-generic-v4.json |tail -n1
OVERALL_GEOMEAN +0.0190 +0.0190 0 0 0 0
```
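Since the patch below is truncated, here is the decomposition rule in one place: |C| must be a single power of two, a sum 2^a + 2^b, or a difference 2^a - 2^b, where 2^b is the lowest set bit of |C|; a negative constant negates the shifted result. The sketch below is illustrative, not code from the patch (the `Decomp` struct, `decompose`, and the GCC/Clang `__builtin_ctz` builtin are my choices here); it exhaustively checks the identity against a wrapping i8 multiply:

```c++
#include <cassert>
#include <cstdint>

struct Decomp {
  bool Ok = false, Negate = false, IsSub = false;
  unsigned NumShifts = 0, ShiftA = 0, ShiftB = 0;
};

// Decompose |C| as 2^a, 2^a + 2^b, or 2^a - 2^b (2^b = lowest set bit).
Decomp decompose(int8_t C) {
  Decomp D;
  D.Negate = C < 0;
  uint32_t U = static_cast<uint8_t>(D.Negate ? -C : C);
  if (U < 2)
    return D; // 0 and 1 are not worth decomposing.
  uint32_t Low = U & (0U - U); // Isolate the lowest set bit of U.
  if (U == Low) {              // |C| is a power of two: one shift.
    D.ShiftA = static_cast<unsigned>(__builtin_ctz(U));
    D.NumShifts = 1;
    D.Ok = true;
    return D;
  }
  D.ShiftB = static_cast<unsigned>(__builtin_ctz(Low));
  uint32_t Rem = U - Low;
  if ((Rem & (Rem - 1)) == 0) { // |C| = 2^a + 2^b.
    D.ShiftA = static_cast<unsigned>(__builtin_ctz(Rem));
    D.NumShifts = 2;
    D.Ok = true;
    return D;
  }
  uint32_t Sum = U + Low;
  if (Sum <= 0xFF && (Sum & (Sum - 1)) == 0) { // |C| = 2^a - 2^b.
    D.ShiftA = static_cast<unsigned>(__builtin_ctz(Sum));
    D.IsSub = true;
    D.NumShifts = 2;
    D.Ok = true;
  }
  return D;
}

int main() {
  // For every decomposable constant, the shift form must match the
  // wrapping i8 multiply for all 256 inputs.
  for (int C = -128; C < 128; ++C) {
    Decomp D = decompose(static_cast<int8_t>(C));
    if (!D.Ok)
      continue;
    for (unsigned X = 0; X < 256; ++X) {
      uint8_t R;
      if (D.NumShifts == 1)
        R = static_cast<uint8_t>(X << D.ShiftA);
      else if (D.IsSub)
        R = static_cast<uint8_t>((X << D.ShiftA) - (X << D.ShiftB));
      else
        R = static_cast<uint8_t>((X << D.ShiftA) + (X << D.ShiftB));
      if (D.Negate)
        R = static_cast<uint8_t>(-R);
      assert(R == static_cast<uint8_t>(X * static_cast<uint8_t>(C)));
    }
  }
  return 0;
}
```

Compiled with `g++ -std=c++17`, it runs to completion if every decomposition is correct.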
---
Patch is 81.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/174110.diff
6 Files Affected:
- (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+17)
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+24)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+59)
- (modified) llvm/lib/Target/X86/X86ISelLowering.h (+2)
- (added) llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll (+1231)
- (added) llvm/test/CodeGen/X86/vector-mul-i8-negative.ll (+466)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8ad64a852b74d..7594b487b9666 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2538,6 +2538,23 @@ class LLVM_ABI TargetLoweringBase {
return false;
}
+ /// Structure to hold detailed decomposition of multiply by constant.
+ struct MulByConstInfo {
+ bool IsDecomposable = false;
+ bool Negate = false; // True if result should be negated
+ unsigned NumShifts = 0; // 1 or 2
+ unsigned Shift1 = 0; // Primary shift amount
+ unsigned Shift2 = 0; // Secondary shift amount (for 2-shift case)
+ bool IsSub = false; // True for SUB, false for ADD (for 2-shift case)
+ };
+
+ /// Get detailed decomposition of multiply by constant if available.
+ /// Returns decomposition info if the target has a custom decomposition
+ /// for this multiply-by-constant; otherwise IsDecomposable is false.
+ virtual MulByConstInfo getMulByConstInfo(EVT VT, const APInt &C) const {
+ return MulByConstInfo();
+ }
+
/// Return true if it may be profitable to transform
/// (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
/// This may not be true if c1 and c2 can be represented as immediates but
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 74d00317c3649..26ff36a768167 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4836,6 +4836,30 @@ template <class MatchContextClass> SDValue DAGCombiner::visitMUL(SDNode *N) {
// x * -0xf800 --> -((x << 16) - (x << 11)) ; (x << 11) - (x << 16)
if (!UseVP && N1IsConst &&
TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
+ // First check if target has custom decomposition info
+ TargetLowering::MulByConstInfo Info =
+ TLI.getMulByConstInfo(VT, ConstValue1);
+ if (Info.IsDecomposable) {
+ // Emit custom decomposition based on target's info
+ SDValue Result;
+ if (Info.NumShifts == 1) {
+ // Single shift: result = N0 << Shift1
+ Result = DAG.getNode(ISD::SHL, DL, VT, N0,
+ DAG.getConstant(Info.Shift1, DL, VT));
+ } else {
+ // Two shifts combined with an add or a sub.
+ assert(Info.NumShifts == 2 && "unexpected MulByConstInfo shift count");
+ SDValue Shl1 = DAG.getNode(ISD::SHL, DL, VT, N0,
+ DAG.getConstant(Info.Shift1, DL, VT));
+ SDValue Shl2 = DAG.getNode(ISD::SHL, DL, VT, N0,
+ DAG.getConstant(Info.Shift2, DL, VT));
+ Result =
+ DAG.getNode(Info.IsSub ? ISD::SUB : ISD::ADD, DL, VT, Shl1, Shl2);
+ }
+ if (Info.Negate)
+ Result = DAG.getNegative(Result, DL, VT);
+ return Result;
+ }
+
// TODO: We could handle more general decomposition of any constant by
// having the target set a limit on number of ops and making a
// callback to determine that sequence (similar to sqrt expansion).
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 20136ade7c317..f75ce5a53188f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3389,6 +3389,12 @@ bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
if (!ISD::isConstantSplatVector(C.getNode(), MulC))
return false;
+ // Check if this is an 8-bit vector multiply that can be decomposed to shifts.
+ if (VT.isVector() && VT.getScalarSizeInBits() == 8) {
+ if (getMulByConstInfo(VT, MulC).IsDecomposable)
+ return true;
+ }
+
// Find the type this will be legalized too. Otherwise we might prematurely
// convert this to shl+add/sub and then still have to type legalize those ops.
// Another choice would be to defer the decision for illegal types until
@@ -3413,6 +3419,59 @@ bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
(1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
}
+TargetLowering::MulByConstInfo
+X86TargetLowering::getMulByConstInfo(EVT VT, const APInt &Constant) const {
+ // Only handle 8-bit vector multiplies
+ if (!VT.isVector() || VT.getScalarSizeInBits() != 8 ||
+ Constant.getBitWidth() < 8)
+ return MulByConstInfo();
+
+ TargetLowering::MulByConstInfo Info;
+ int8_t SignedC = static_cast<int8_t>(Constant.getZExtValue());
+ Info.Negate = SignedC < 0;
+
+ uint32_t U = static_cast<uint8_t>(Info.Negate ? -SignedC : SignedC);
+ if (U == 0 || U == 1)
+ return Info;
+
+ // Power of 2.
+ if (isPowerOf2_32(U)) {
+ Info.Shift1 = llvm::countr_zero(U);
+ Info.NumShifts = 1;
+ Info.IsDecomposable = true;
+ return Info;
+ }
+
+ // Decomposition logic (v is the multiplied operand):
+ // m = 2^a + 2^b => (shl v, a) + (shl v, b)
+ // m = 2^a - 2^b => (shl v, a) - (shl v, b)
+ // where 2^b is the lowest set bit of m.
+ uint32_t LowBit = U & (0U - U);
+ unsigned Shift2 = llvm::countr_zero(LowBit);
+
+ uint32_t Rem = U - LowBit;
+ if (isPowerOf2_32(Rem)) {
+ Info.Shift1 = llvm::countr_zero(Rem);
+ Info.Shift2 = Shift2;
+ Info.IsSub = false;
+ Info.NumShifts = 2;
+ Info.IsDecomposable = true;
+ return Info;
+ }
+
+ // For m = 2^a - 2^b, adding the lowest set bit back leaves a power of
+ // two. Sum == 0x100 is rejected: it would need an i8 shift amount of 8.
+ uint32_t Sum = U + LowBit;
+ if (Sum <= 0xFF && isPowerOf2_32(Sum)) {
+ Info.Shift1 = llvm::countr_zero(Sum);
+ Info.Shift2 = Shift2;
+ Info.IsSub = true;
+ Info.NumShifts = 2;
+ Info.IsDecomposable = true;
+ return Info;
+ }
+
+ return Info;
+}
+
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index a528c311975d8..24372598aaf53 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1537,6 +1537,8 @@ namespace llvm {
bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const override;
+ MulByConstInfo getMulByConstInfo(EVT VT, const APInt &C) const override;
+
/// Return true if EXTRACT_SUBVECTOR is cheap for this result type
/// with this index.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
diff --git a/llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll b/llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll
new file mode 100644
index 0000000000000..9648352ebc2a9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll
@@ -0,0 +1,1231 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-tuning-fast-imm-vector-shift | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-tuning-fast-imm-vector-shift,+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-tuning-fast-imm-vector-shift,+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512
+
+;; Tests vXi8 constant-multiply decomposition into shift/add/sub sequences.
+;;
+;; Examples:
+;; 6 = 2^2 + 2^1 = 4 + 2 (or 8 - 2)
+;; 10 = 2^3 + 2^1 = 8 + 2
+;; 12 = 2^3 + 2^2 = 8 + 4 (or 16 - 4)
+;; 18 = 2^4 + 2^1 = 16 + 2
+;; 20 = 2^4 + 2^2 = 16 + 4
+;; 24 = 2^4 + 2^3 = 16 + 8 (or 32 - 8)
+;;
+;; To run this test:
+;; llvm-lit llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll
+;;
+;; To regenerate CHECK lines:
+;; python llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/X86/vector-mul-i8-decompose.ll
+
+;; ============================================================================
+;; v16i8 Tests (128-bit vectors) - Sum of two powers of 2
+;; ============================================================================
+
+define <16 x i8> @mul_v16i8_const6(<16 x i8> %a) nounwind {
+; Test multiply by 6 = 4 + 2 = (1 << 2) + (1 << 1)
+; SSE2-LABEL: mul_v16i8_const6:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v16i8_const6:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; AVX2-NEXT: vpsllw $2, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v16i8_const6:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; AVX512-NEXT: vpsllw $2, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %result = mul <16 x i8> %a, <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>
+ ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const10(<16 x i8> %a) nounwind {
+; Test multiply by 10 = 8 + 2 = (1 << 3) + (1 << 1)
+; SSE2-LABEL: mul_v16i8_const10:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: psllw $3, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v16i8_const10:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; AVX2-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v16i8_const10:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %result = mul <16 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
+ ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const12(<16 x i8> %a) nounwind {
+; Test multiply by 12 = 8 + 4 = (1 << 3) + (1 << 2)
+; SSE2-LABEL: mul_v16i8_const12:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $2, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psllw $3, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v16i8_const12:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllw $2, %xmm0, %xmm1
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v16i8_const12:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllw $2, %xmm0, %xmm1
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %result = mul <16 x i8> %a, <i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12>
+ ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const18(<16 x i8> %a) nounwind {
+; Test multiply by 18 = 16 + 2 = (1 << 4) + (1 << 1)
+; SSE2-LABEL: mul_v16i8_const18:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v16i8_const18:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v16i8_const18:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %result = mul <16 x i8> %a, <i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18, i8 18>
+ ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const20(<16 x i8> %a) nounwind {
+; Test multiply by 20 = 16 + 4 = (1 << 4) + (1 << 2)
+; SSE2-LABEL: mul_v16i8_const20:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $2, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v16i8_const20:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllw $2, %xmm0, %xmm1
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v16i8_const20:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllw $2, %xmm0, %xmm1
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %result = mul <16 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20>
+ ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const24(<16 x i8> %a) nounwind {
+; Test multiply by 24 = 16 + 8 = (1 << 4) + (1 << 3)
+; SSE2-LABEL: mul_v16i8_const24:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $3, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psllw $4, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v16i8_const24:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllw $3, %xmm0, %xmm1
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v16i8_const24:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllw $3, %xmm0, %xmm1
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %result = mul <16 x i8> %a, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+ ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const34(<16 x i8> %a) nounwind {
+; Test multiply by 34 = 32 + 2 = (1 << 5) + (1 << 1)
+; SSE2-LABEL: mul_v16i8_const34:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: psllw $5, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v16i8_const34:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; AVX2-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v16i8_const34:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %result = mul <16 x i8> %a, <i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34, i8 34>
+ ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const36(<16 x i8> %a) nounwind {
+; Test multiply by 36 = 32 + 4 = (1 << 5) + (1 << 2)
+; SSE2-LABEL: mul_v16i8_const36:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $2, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psllw $5, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v16i8_const36:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllw $2, %xmm0, %xmm1
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v16i8_const36:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllw $2, %xmm0, %xmm1
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %result = mul <16 x i8> %a, <i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36, i8 36>
+ ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const40(<16 x i8> %a) nounwind {
+; Test multiply by 40 = 32 + 8 = (1 << 5) + (1 << 3)
+; SSE2-LABEL: mul_v16i8_const40:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $3, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psllw $5, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v16i8_const40:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllw $3, %xmm0, %xmm1
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v16i8_const40:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllw $3, %xmm0, %xmm1
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %result = mul <16 x i8> %a, <i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40, i8 40>
+ ret <16 x i8> %result
+}
+
+define <16 x i8> @mul_v16i8_const48(<16 x i8> %a) nounwind {
+; Test multiply by 48 = 32 + 16 = (1 << 5) + (1 << 4)
+; SSE2-LABEL: mul_v16i8_const48:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psllw $4, %xmm1
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: psllw $5, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: mul_v16i8_const48:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v16i8_const48:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %result = mul <16 x i8> %a, <i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48, i8 48>
+ ret <16 x i...
[truncated]
``````````
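A note on the `pand`s in the CHECK lines above: x86 has no byte-granular vector shift, so a vXi8 `shl` is lowered as a 16-bit `vpsllw` plus a `vpand` that clears the bits shifted across each byte boundary. Below is a minimal scalar model of that pair, assuming two i8 lanes packed into a `uint16_t` (`shl_v2i8` is an illustrative name, not an LLVM API):

```c++
#include <cassert>
#include <cstdint>

// Model vpsllw + vpand on two i8 lanes packed into a uint16_t: shift the
// whole word (vpsllw only has 16-bit granularity), then mask off the bits
// that spilled across the byte boundary (vpand).
uint16_t shl_v2i8(uint16_t Lanes, unsigned S) {
  uint16_t Shifted = static_cast<uint16_t>(Lanes << S);            // vpsllw
  uint16_t ByteMask = static_cast<uint16_t>((0xFFu << S) & 0xFFu);
  uint16_t Mask = static_cast<uint16_t>(ByteMask | (ByteMask << 8));
  return Shifted & Mask;                                           // vpand
}

int main() {
  // Exhaustively check against an independent per-byte shift.
  for (unsigned S = 0; S < 8; ++S)
    for (unsigned V = 0; V <= 0xFFFF; ++V) {
      uint8_t Lo = static_cast<uint8_t>((V & 0xFF) << S);
      uint8_t Hi = static_cast<uint8_t>(((V >> 8) & 0xFF) << S);
      assert(shl_v2i8(static_cast<uint16_t>(V), S) ==
             static_cast<uint16_t>(Lo | (Hi << 8)));
    }
  return 0;
}
```

This is why each decomposed shift costs a shift/mask pair in the output above (shift-by-one is the exception, since it can be emitted as a `paddb`), which is worth keeping in mind when weighing the sequence against a `pmullw`-based multiply.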
</details>
https://github.com/llvm/llvm-project/pull/174110
More information about the llvm-commits mailing list