[llvm] b6808ba - [X86] Make constant `mul` -> `shl` + `add`/`sub` work for vector types
Noah Goldstein via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 10 12:39:08 PDT 2023
Author: Noah Goldstein
Date: 2023-06-10T14:38:46-05:00
New Revision: b6808ba291fcb5ff2e0d651e9e710f3d75631bc4
URL: https://github.com/llvm/llvm-project/commit/b6808ba291fcb5ff2e0d651e9e710f3d75631bc4
DIFF: https://github.com/llvm/llvm-project/commit/b6808ba291fcb5ff2e0d651e9e710f3d75631bc4.diff
LOG: [X86] Make constant `mul` -> `shl` + `add`/`sub` work for vector types
Something like:
`%r = mul %x, <33, 33, 33, ...>`
is best lowered as:
`%tmp = shl %x, <5, 5, 5, ...>; %r = add %tmp, %x`
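For example, for a `<4 x i32>` multiply by 33 this becomes a shift-by-5
feeding an add, instead of a `vpmulld` against a constant-pool splat
(a sketch in AVX register form; exact register allocation depends on the
target, see the updated tests below):
    vpslld  $5, %xmm0, %xmm1
    vpaddd  %xmm1, %xmm0, %xmm0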
In addition, since vector shifts are non-destructive, we can also handle
cases where the multiply constant is `Pow2A +/- Pow2B` for arbitrary A
and B, unlike in the scalar case, where the extra `mov` instructions
make the transform not worth it.
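The decomposition check itself is small. Below is a standalone C++ sketch
of the `Pow2A +/- Pow2B` test performed by the new vector-only path in
`combineMul` (illustrative helper names, not the actual LLVM utilities;
the real code also handles negation and the 2^N +/- 1 and 2^N +/- 2 cases
first):

  #include <cstdint>
  #include <cstdio>
  #include <optional>

  // Illustrative stand-ins for LLVM's isPowerOf2_64 / Log2_64.
  static bool isPow2(uint64_t V) { return V && (V & (V - 1)) == 0; }
  static unsigned log2u(uint64_t V) { unsigned N = 0; while (V >>= 1) ++N; return N; }

  // Shift amounts A and B such that Amt == 2^A + 2^B (IsAdd) or 2^A - 2^B.
  struct Decomp { unsigned ShiftA, ShiftB; bool IsAdd; };

  // Hypothetical helper mirroring the check above: peel off the lowest set
  // bit and see whether the remainder (or the sum) is a single power of two.
  static std::optional<Decomp> decomposeMulAmt(uint64_t Amt) {
    uint64_t LowBit = Amt & (0 - Amt);  // lowest set bit of Amt
    if (isPow2(Amt - LowBit))           // Amt == 2^A + LowBit
      return Decomp{log2u(Amt - LowBit), log2u(LowBit), /*IsAdd=*/true};
    if (isPow2(Amt + LowBit))           // Amt == 2^A - LowBit
      return Decomp{log2u(Amt + LowBit), log2u(LowBit), /*IsAdd=*/false};
    return std::nullopt;
  }

  int main() {
    // 40 = 32 + 8 -> (x << 5) + (x << 3); 56 = 64 - 8 -> (x << 6) - (x << 3)
    for (uint64_t Amt : {33u, 40u, 56u, 22u}) {
      if (auto D = decomposeMulAmt(Amt))
        printf("%u = (x << %u) %c (x << %u)\n", (unsigned)Amt, D->ShiftA,
               D->IsAdd ? '+' : '-', D->ShiftB);
      else
        printf("%u: no two-term decomposition\n", (unsigned)Amt);
    }
  }

Each term then becomes an immediate vector shift and the two shifts are
combined with a single add/sub, which is why this path is gated on the new
`TuningFastImmVectorShift` flag.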
Reviewed By: pengfei
Differential Revision: https://reviews.llvm.org/D150324
Added:
Modified:
llvm/lib/Target/X86/X86.td
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86TargetTransformInfo.h
llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
llvm/test/CodeGen/X86/combine-add.ll
llvm/test/CodeGen/X86/rotate-extract-vector.ll
llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
llvm/test/CodeGen/X86/vector-mul.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index e9f9f1bb142b1..1cd13cce456fc 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -562,6 +562,10 @@ def TuningPreferShiftShuffle : SubtargetFeature<"faster-shift-than-shuffle",
"PreferLowerShuffleAsShift", "true",
"Shifts are faster (or as fast) as shuffle">;
+def TuningFastImmVectorShift : SubtargetFeature<"tuning-fast-imm-vector-shift",
+ "FastImmVectorShift", "true",
+ "Vector shifts are fast (2/cycle) as opposed to slow (1/cycle)">;
+
// On some X86 processors, a vzeroupper instruction should be inserted after
// using ymm/zmm registers before executing code that may use SSE instructions.
def TuningInsertVZEROUPPER
@@ -939,7 +943,8 @@ def ProcessorFeatures {
TuningPreferShiftShuffle,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
- TuningNoDomainDelayBlend];
+ TuningNoDomainDelayBlend,
+ TuningFastImmVectorShift];
list<SubtargetFeature> SKXFeatures =
!listconcat(BDWFeatures, SKXAdditionalFeatures);
@@ -980,7 +985,8 @@ def ProcessorFeatures {
TuningAllowLight256Bit,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
- TuningNoDomainDelayBlend];
+ TuningNoDomainDelayBlend,
+ TuningFastImmVectorShift];
list<SubtargetFeature> CNLFeatures =
!listconcat(SKLFeatures, CNLAdditionalFeatures);
@@ -1008,7 +1014,8 @@ def ProcessorFeatures {
TuningAllowLight256Bit,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
- TuningNoDomainDelayBlend];
+ TuningNoDomainDelayBlend,
+ TuningFastImmVectorShift];
list<SubtargetFeature> ICLFeatures =
!listconcat(CNLFeatures, ICLAdditionalFeatures);
@@ -1170,7 +1177,8 @@ def ProcessorFeatures {
FeatureMOVDIR64B,
FeatureWAITPKG];
list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps,
- TuningPreferMovmskOverVTest];
+ TuningPreferMovmskOverVTest,
+ TuningFastImmVectorShift];
list<SubtargetFeature> ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning);
list<SubtargetFeature> ADLFeatures =
!listconcat(TRMFeatures, ADLAdditionalFeatures);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6be8d8223aa30..7197fc33fdb5c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48722,12 +48722,25 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
- if (VT != MVT::i64 && VT != MVT::i32)
+ if (VT != MVT::i64 && VT != MVT::i32 &&
+ (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
return SDValue();
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!C)
- return SDValue();
+ ConstantSDNode *CNode = isConstOrConstSplat(
+ N->getOperand(1), /*AllowUndefs*/ true, /*AllowTrunc*/ false);
+ const APInt *C = nullptr;
+ if (!CNode) {
+ if (VT.isVector())
+ if (auto *RawC = getTargetConstantFromNode(N->getOperand(1)))
+ if (auto *SplatC = RawC->getSplatValue())
+ C = &(SplatC->getUniqueInteger());
+
+ if (!C)
+ return SDValue();
+ } else {
+ C = &(CNode->getAPIntValue());
+ }
+
if (isPowerOf2_64(C->getZExtValue()))
return SDValue();
@@ -48736,68 +48749,69 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
SDLoc DL(N);
- if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
- SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
- DAG.getConstant(AbsMulAmt, DL, VT));
- if (SignMulAmt < 0)
- NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- NewMul);
-
- return NewMul;
- }
-
- uint64_t MulAmt1 = 0;
- uint64_t MulAmt2 = 0;
- if ((AbsMulAmt % 9) == 0) {
- MulAmt1 = 9;
- MulAmt2 = AbsMulAmt / 9;
- } else if ((AbsMulAmt % 5) == 0) {
- MulAmt1 = 5;
- MulAmt2 = AbsMulAmt / 5;
- } else if ((AbsMulAmt % 3) == 0) {
- MulAmt1 = 3;
- MulAmt2 = AbsMulAmt / 3;
- }
-
- SDValue NewMul;
- // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
- if (MulAmt2 &&
- (isPowerOf2_64(MulAmt2) ||
- (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
-
- if (isPowerOf2_64(MulAmt2) &&
- !(SignMulAmt >= 0 && N->hasOneUse() &&
- N->use_begin()->getOpcode() == ISD::ADD))
- // If second multiplifer is pow2, issue it first. We want the multiply by
- // 3, 5, or 9 to be folded into the addressing mode unless the lone use
- // is an add. Only do this for positive multiply amounts since the
- // negate would prevent it from being used as an address mode anyway.
- std::swap(MulAmt1, MulAmt2);
-
- if (isPowerOf2_64(MulAmt1))
- NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
- else
+ SDValue NewMul = SDValue();
+ if (VT == MVT::i64 || VT == MVT::i32) {
+ if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
- DAG.getConstant(MulAmt1, DL, VT));
-
- if (isPowerOf2_64(MulAmt2))
- NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
- DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
- else
- NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
- DAG.getConstant(MulAmt2, DL, VT));
+ DAG.getConstant(AbsMulAmt, DL, VT));
+ if (SignMulAmt < 0)
+ NewMul =
+ DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
+
+ return NewMul;
+ }
+
+ uint64_t MulAmt1 = 0;
+ uint64_t MulAmt2 = 0;
+ if ((AbsMulAmt % 9) == 0) {
+ MulAmt1 = 9;
+ MulAmt2 = AbsMulAmt / 9;
+ } else if ((AbsMulAmt % 5) == 0) {
+ MulAmt1 = 5;
+ MulAmt2 = AbsMulAmt / 5;
+ } else if ((AbsMulAmt % 3) == 0) {
+ MulAmt1 = 3;
+ MulAmt2 = AbsMulAmt / 3;
+ }
+
+ // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
+ if (MulAmt2 &&
+ (isPowerOf2_64(MulAmt2) ||
+ (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
+
+ if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
+ N->use_begin()->getOpcode() == ISD::ADD))
+ // If second multiplifer is pow2, issue it first. We want the multiply
+ // by 3, 5, or 9 to be folded into the addressing mode unless the lone
+ // use is an add. Only do this for positive multiply amounts since the
+ // negate would prevent it from being used as an address mode anyway.
+ std::swap(MulAmt1, MulAmt2);
+
+ if (isPowerOf2_64(MulAmt1))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(MulAmt1, DL, VT));
- // Negate the result.
- if (SignMulAmt < 0)
- NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- NewMul);
- } else if (!Subtarget.slowLEA())
- NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
+ if (isPowerOf2_64(MulAmt2))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+ DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
+ DAG.getConstant(MulAmt2, DL, VT));
+ // Negate the result.
+ if (SignMulAmt < 0)
+ NewMul =
+ DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
+ } else if (!Subtarget.slowLEA())
+ NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
+ }
if (!NewMul) {
+ EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
assert(C->getZExtValue() != 0 &&
- C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
+ C->getZExtValue() != maxUIntN(VT.getScalarSizeInBits()) &&
"Both cases that could cause potential overflows should have "
"already been handled.");
if (isPowerOf2_64(AbsMulAmt - 1)) {
@@ -48805,38 +48819,61 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
NewMul = DAG.getNode(
ISD::ADD, DL, VT, N->getOperand(0),
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
- MVT::i8)));
+ DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
// To negate, subtract the number from zero
if (SignMulAmt < 0)
- NewMul = DAG.getNode(ISD::SUB, DL, VT,
- DAG.getConstant(0, DL, VT), NewMul);
+ NewMul =
+ DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
} else if (isPowerOf2_64(AbsMulAmt + 1)) {
// (mul x, 2^N - 1) => (sub (shl x, N), x)
- NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(AbsMulAmt + 1),
- DL, MVT::i8));
+ NewMul =
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
// To negate, reverse the operands of the subtract.
if (SignMulAmt < 0)
NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
else
NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
- } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
+ } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
+ (!VT.isVector() || Subtarget.fastImmVectorShift())) {
// (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
- NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(AbsMulAmt - 2),
- DL, MVT::i8));
+ NewMul =
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
NewMul = DAG.getNode(
ISD::ADD, DL, VT, NewMul,
DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
- } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
+ } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
+ (!VT.isVector() || Subtarget.fastImmVectorShift())) {
// (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
- NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(AbsMulAmt + 2),
- DL, MVT::i8));
+ NewMul =
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
NewMul = DAG.getNode(
ISD::SUB, DL, VT, NewMul,
DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
+ } else if (SignMulAmt >= 0 && VT.isVector() &&
+ Subtarget.fastImmVectorShift()) {
+ uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
+ uint64_t ShiftAmt1;
+ std::optional<unsigned> Opc;
+ if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
+ ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
+ Opc = ISD::ADD;
+ } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
+ ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
+ Opc = ISD::SUB;
+ }
+
+ if (Opc) {
+ SDValue Shift1 =
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
+ SDValue Shift2 =
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
+ NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
+ }
}
}
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 857d95eb65839..0e1a475b2af66 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -93,6 +93,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::TuningNoDomainDelayShuffle,
X86::TuningNoDomainDelayBlend,
X86::TuningPreferShiftShuffle,
+ X86::TuningFastImmVectorShift,
// Perf-tuning flags.
X86::TuningFastGather,
diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
index 470dfdfccfc41..8879f04687ba2 100644
--- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
@@ -210,12 +210,13 @@ define void @bcast_unfold_mul_v16i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v16i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
-; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB6_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %zmm0, %zmm1
-; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax)
+; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm0, %zmm1
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vmovdqu64 %zmm0, 4096(%rdi,%rax)
; CHECK-NEXT: addq $64, %rax
; CHECK-NEXT: jne .LBB6_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -244,12 +245,13 @@ define void @bcast_unfold_mul_v8i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v8i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
-; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB7_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %ymm0, %ymm1
-; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax)
+; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm1
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vmovdqu %ymm0, 4096(%rdi,%rax)
; CHECK-NEXT: addq $32, %rax
; CHECK-NEXT: jne .LBB7_1
; CHECK-NEXT: # %bb.2: # %bb10
@@ -278,12 +280,13 @@ define void @bcast_unfold_mul_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v4i32:
; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000
-; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB8_1: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vpmulld 4096(%rdi,%rax), %xmm0, %xmm1
-; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax)
+; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm1
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqu %xmm0, 4096(%rdi,%rax)
; CHECK-NEXT: addq $16, %rax
; CHECK-NEXT: jne .LBB8_1
; CHECK-NEXT: # %bb.2: # %bb10
diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll
index 72b987d452f5c..182c4ca29905b 100644
--- a/llvm/test/CodeGen/X86/combine-add.ll
+++ b/llvm/test/CodeGen/X86/combine-add.ll
@@ -234,13 +234,16 @@ define void @PR52039(ptr %pa, ptr %pb) {
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: psubd %xmm1, %xmm3
; SSE-NEXT: psubd %xmm0, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,3]
-; SSE-NEXT: movdqu %xmm2, (%rsi)
-; SSE-NEXT: pmulld %xmm0, %xmm2
-; SSE-NEXT: pmulld %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: movdqu %xmm3, 16(%rsi)
-; SSE-NEXT: movdqu %xmm0, 16(%rdi)
-; SSE-NEXT: movdqu %xmm2, (%rdi)
+; SSE-NEXT: movdqu %xmm2, (%rsi)
+; SSE-NEXT: movdqu %xmm1, 16(%rdi)
+; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: PR52039:
diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
index 58d9d7e6952f3..4c5a3c12fa385 100644
--- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
@@ -103,14 +103,16 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
; X86-LABEL: vrolw_extract_mul_with_mask:
; X86: # %bb.0:
-; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
+; X86-NEXT: vpslld $3, %xmm0, %xmm1
+; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X86-NEXT: vprold $7, %xmm0, %xmm0
; X86-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: vrolw_extract_mul_with_mask:
; X64: # %bb.0:
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; X64-NEXT: vpslld $3, %xmm0, %xmm1
+; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X64-NEXT: vprold $7, %xmm0, %xmm0
; X64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-NEXT: retq
@@ -194,7 +196,8 @@ define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
; X86-LABEL: no_extract_mul:
; X86: # %bb.0:
; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
-; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
+; X86-NEXT: vpslld $3, %ymm0, %ymm2
+; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; X86-NEXT: vpsrld $23, %ymm0, %ymm0
; X86-NEXT: vpor %ymm0, %ymm1, %ymm0
; X86-NEXT: retl
@@ -202,7 +205,8 @@ define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
; X64-LABEL: no_extract_mul:
; X64: # %bb.0:
; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
-; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; X64-NEXT: vpslld $3, %ymm0, %ymm2
+; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; X64-NEXT: vpsrld $23, %ymm0, %ymm0
; X64-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 53e450856632d..beadd93ac6e54 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -487,8 +487,10 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE41-NEXT: psrld $31, %xmm1
; SSE41-NEXT: psrad $2, %xmm2
; SSE41-NEXT: paddd %xmm1, %xmm2
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: psubd %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: pslld $3, %xmm1
+; SSE41-NEXT: psubd %xmm1, %xmm2
+; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_4i32:
@@ -503,8 +505,9 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpslld $3, %xmm1, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_4i32:
@@ -519,9 +522,9 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
-; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpslld $3, %xmm1, %xmm2
+; AVX2-NEXT: vpsubd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%res = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
ret <4 x i32> %res
@@ -536,8 +539,10 @@ define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-NEXT: psrlw $15, %xmm2
; SSE-NEXT: psraw $1, %xmm1
; SSE-NEXT: paddw %xmm2, %xmm1
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: psubw %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psllw $3, %xmm2
+; SSE-NEXT: psubw %xmm2, %xmm1
+; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_rem7_8i16:
@@ -546,8 +551,9 @@ define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT: vpsraw $1, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $3, %xmm1, %xmm2
+; AVX-NEXT: vpsubw %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%res = srem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
ret <8 x i16> %res
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index 279f3c464411c..95d035b2c3ae4 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -497,9 +497,9 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2
; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
-; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpslld $3, %ymm1, %ymm2
+; AVX2-NEXT: vpsubd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%res = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
ret <8 x i32> %res
@@ -533,8 +533,9 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX2-NEXT: vpsrlw $15, %ymm1, %ymm2
; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $3, %ymm1, %ymm2
+; AVX2-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%res = srem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
ret <16 x i16> %res
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index 04761b6d35c5f..ede1c82ff5b82 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -400,8 +400,9 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
; AVX-NEXT: vpsrld $31, %zmm1, %zmm2
; AVX-NEXT: vpsrad $2, %zmm1, %zmm1
; AVX-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; AVX-NEXT: vpslld $3, %zmm1, %zmm2
+; AVX-NEXT: vpsubd %zmm2, %zmm1, %zmm1
+; AVX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX-NEXT: retq
%res = srem <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
ret <16 x i32> %res
@@ -435,8 +436,9 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $15, %zmm1, %zmm2
; AVX512BW-NEXT: vpsraw $1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%res = srem <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
ret <32 x i16> %res
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
index 8f9f93a7b5dc5..db4b83a782e1c 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -558,8 +558,10 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE41-NEXT: psrld $1, %xmm1
; SSE41-NEXT: paddd %xmm2, %xmm1
; SSE41-NEXT: psrld $2, %xmm1
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pslld $3, %xmm2
+; SSE41-NEXT: psubd %xmm2, %xmm1
+; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_4i32:
@@ -574,8 +576,9 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpslld $3, %xmm1, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_4i32:
@@ -590,9 +593,9 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2
; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
-; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpslld $3, %xmm1, %xmm2
+; AVX2-NEXT: vpsubd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%res = urem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
ret <4 x i32> %res
@@ -608,8 +611,10 @@ define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-NEXT: psrlw $1, %xmm2
; SSE-NEXT: paddw %xmm1, %xmm2
; SSE-NEXT: psrlw $2, %xmm2
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT: psubw %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: psllw $3, %xmm1
+; SSE-NEXT: psubw %xmm1, %xmm2
+; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_rem7_8i16:
@@ -619,8 +624,9 @@ define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; AVX-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $3, %xmm1, %xmm2
+; AVX-NEXT: vpsubw %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%res = urem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
ret <8 x i16> %res
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 805dd422ac491..c78552cd78e3f 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -528,9 +528,9 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpsrld $2, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7]
-; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpslld $3, %ymm1, %ymm2
+; AVX2-NEXT: vpsubd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%res = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
ret <8 x i32> %res
@@ -567,8 +567,9 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsllw $3, %ymm1, %ymm2
+; AVX2-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%res = urem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
ret <16 x i16> %res
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index 5169dd69f39fc..643eb30c40de0 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -420,8 +420,9 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
; AVX-NEXT: vpsrld $1, %zmm1, %zmm1
; AVX-NEXT: vpaddd %zmm3, %zmm1, %zmm1
; AVX-NEXT: vpsrld $2, %zmm1, %zmm1
-; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
-; AVX-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; AVX-NEXT: vpslld $3, %zmm1, %zmm2
+; AVX-NEXT: vpsubd %zmm2, %zmm1, %zmm1
+; AVX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX-NEXT: retq
%res = urem <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
ret <16 x i32> %res
@@ -458,8 +459,9 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%res = urem <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
ret <32 x i16> %res
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 34ab53e552b74..42db9b79b57f2 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -322,33 +322,17 @@ define <2 x i64> @mul_v2i64_17(<2 x i64> %a0) nounwind {
}
define <4 x i32> @mul_v4i32_17(<4 x i32> %a0) nounwind {
-; SSE2-LABEL: mul_v4i32_17:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslld $4, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: ret{{[l|q]}}
-;
-; X86-SSE4-LABEL: mul_v4i32_17:
-; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE4-NEXT: retl
-;
-; X64-SSE4-FAST-LABEL: mul_v4i32_17:
-; X64-SSE4-FAST: # %bb.0:
-; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE4-FAST-NEXT: retq
-;
-; X64-SSE4-SLOW-LABEL: mul_v4i32_17:
-; X64-SSE4-SLOW: # %bb.0:
-; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE4-SLOW-NEXT: pslld $4, %xmm1
-; X64-SSE4-SLOW-NEXT: paddd %xmm1, %xmm0
-; X64-SSE4-SLOW-NEXT: retq
+; SSE-LABEL: mul_v4i32_17:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pslld $4, %xmm1
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i32_17:
; X64-XOP: # %bb.0:
-; X64-XOP-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: vpslld $4, %xmm0, %xmm1
+; X64-XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_17:
@@ -366,19 +350,17 @@ define <4 x i32> @mul_v4i32_17(<4 x i32> %a0) nounwind {
}
define <8 x i16> @mul_v8i16_17(<8 x i16> %a0) nounwind {
-; X86-SSE-LABEL: mul_v8i16_17:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT: retl
-;
-; X64-SSE-LABEL: mul_v8i16_17:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT: retq
+; SSE-LABEL: mul_v8i16_17:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psllw $4, %xmm1
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_17:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpsllw $4, %xmm0, %xmm1
+; X64-AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: retq
%1 = mul <8 x i16> %a0, <i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17>
ret <8 x i16> %1
@@ -461,39 +443,15 @@ define <4 x i64> @mul_v4i64_17(<4 x i64> %a0) nounwind {
}
define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
-; SSE2-LABEL: mul_v8i32_17:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pslld $4, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pslld $4, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: ret{{[l|q]}}
-;
-; X86-SSE4-LABEL: mul_v8i32_17:
-; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17]
-; X86-SSE4-NEXT: pmulld %xmm2, %xmm0
-; X86-SSE4-NEXT: pmulld %xmm2, %xmm1
-; X86-SSE4-NEXT: retl
-;
-; X64-SSE4-FAST-LABEL: mul_v8i32_17:
-; X64-SSE4-FAST: # %bb.0:
-; X64-SSE4-FAST-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17]
-; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm0
-; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm1
-; X64-SSE4-FAST-NEXT: retq
-;
-; X64-SSE4-SLOW-LABEL: mul_v8i32_17:
-; X64-SSE4-SLOW: # %bb.0:
-; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm2
-; X64-SSE4-SLOW-NEXT: pslld $4, %xmm2
-; X64-SSE4-SLOW-NEXT: paddd %xmm2, %xmm0
-; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm2
-; X64-SSE4-SLOW-NEXT: pslld $4, %xmm2
-; X64-SSE4-SLOW-NEXT: paddd %xmm2, %xmm1
-; X64-SSE4-SLOW-NEXT: retq
+; SSE-LABEL: mul_v8i32_17:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pslld $4, %xmm2
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pslld $4, %xmm2
+; SSE-NEXT: paddd %xmm2, %xmm1
+; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v8i32_17:
; X64-XOP: # %bb.0:
@@ -522,9 +480,12 @@ define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
define <16 x i16> @mul_v16i16_17(<16 x i16> %a0) nounwind {
; SSE-LABEL: mul_v16i16_17:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
-; SSE-NEXT: pmullw %xmm2, %xmm0
-; SSE-NEXT: pmullw %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psllw $4, %xmm2
+; SSE-NEXT: paddw %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psllw $4, %xmm2
+; SSE-NEXT: paddw %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i16_17:
@@ -539,12 +500,14 @@ define <16 x i16> @mul_v16i16_17(<16 x i16> %a0) nounwind {
;
; X64-AVX2-LABEL: mul_v16i16_17:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
+; X64-AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i16_17:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm1
+; X64-AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
%1 = mul <16 x i16> %a0, <i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17>
ret <16 x i16> %1
@@ -618,37 +581,21 @@ define <2 x i64> @mul_v2i64_neg1025(<2 x i64> %a0) nounwind {
}
define <4 x i32> @mul_v4i32_neg33(<4 x i32> %a0) nounwind {
-; SSE2-LABEL: mul_v4i32_neg33:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslld $5, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: ret{{[l|q]}}
-;
-; X86-SSE4-LABEL: mul_v4i32_neg33:
-; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE4-NEXT: retl
-;
-; X64-SSE4-FAST-LABEL: mul_v4i32_neg33:
-; X64-SSE4-FAST: # %bb.0:
-; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE4-FAST-NEXT: retq
-;
-; X64-SSE4-SLOW-LABEL: mul_v4i32_neg33:
-; X64-SSE4-SLOW: # %bb.0:
-; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE4-SLOW-NEXT: pslld $5, %xmm1
-; X64-SSE4-SLOW-NEXT: paddd %xmm0, %xmm1
-; X64-SSE4-SLOW-NEXT: pxor %xmm0, %xmm0
-; X64-SSE4-SLOW-NEXT: psubd %xmm1, %xmm0
-; X64-SSE4-SLOW-NEXT: retq
+; SSE-LABEL: mul_v4i32_neg33:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pslld $5, %xmm1
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i32_neg33:
; X64-XOP: # %bb.0:
-; X64-XOP-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: vpslld $5, %xmm0, %xmm1
+; X64-XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; X64-XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_neg33:
@@ -666,19 +613,21 @@ define <4 x i32> @mul_v4i32_neg33(<4 x i32> %a0) nounwind {
}
define <8 x i16> @mul_v8i16_neg9(<8 x i16> %a0) nounwind {
-; X86-SSE-LABEL: mul_v8i16_neg9:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT: retl
-;
-; X64-SSE-LABEL: mul_v8i16_neg9:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT: retq
+; SSE-LABEL: mul_v8i16_neg9:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psllw $3, %xmm1
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: psubw %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_neg9:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpsllw $3, %xmm0, %xmm1
+; X64-AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: retq
%1 = mul <8 x i16> %a0, <i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9>
ret <8 x i16> %1
@@ -783,49 +732,20 @@ define <4 x i64> @mul_v4i64_neg1025(<4 x i64> %a0) nounwind {
}
define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
-; SSE2-LABEL: mul_v8i32_neg33:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pslld $5, %xmm3
-; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: psubd %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pslld $5, %xmm3
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: psubd %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: ret{{[l|q]}}
-;
-; X86-SSE4-LABEL: mul_v8i32_neg33:
-; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
-; X86-SSE4-NEXT: pmulld %xmm2, %xmm0
-; X86-SSE4-NEXT: pmulld %xmm2, %xmm1
-; X86-SSE4-NEXT: retl
-;
-; X64-SSE4-FAST-LABEL: mul_v8i32_neg33:
-; X64-SSE4-FAST: # %bb.0:
-; X64-SSE4-FAST-NEXT: movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
-; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm0
-; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm1
-; X64-SSE4-FAST-NEXT: retq
-;
-; X64-SSE4-SLOW-LABEL: mul_v8i32_neg33:
-; X64-SSE4-SLOW: # %bb.0:
-; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm3
-; X64-SSE4-SLOW-NEXT: pslld $5, %xmm3
-; X64-SSE4-SLOW-NEXT: paddd %xmm0, %xmm3
-; X64-SSE4-SLOW-NEXT: pxor %xmm2, %xmm2
-; X64-SSE4-SLOW-NEXT: pxor %xmm0, %xmm0
-; X64-SSE4-SLOW-NEXT: psubd %xmm3, %xmm0
-; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm3
-; X64-SSE4-SLOW-NEXT: pslld $5, %xmm3
-; X64-SSE4-SLOW-NEXT: paddd %xmm1, %xmm3
-; X64-SSE4-SLOW-NEXT: psubd %xmm3, %xmm2
-; X64-SSE4-SLOW-NEXT: movdqa %xmm2, %xmm1
-; X64-SSE4-SLOW-NEXT: retq
+; SSE-LABEL: mul_v8i32_neg33:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pslld $5, %xmm3
+; SSE-NEXT: paddd %xmm0, %xmm3
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: psubd %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pslld $5, %xmm3
+; SSE-NEXT: paddd %xmm1, %xmm3
+; SSE-NEXT: psubd %xmm3, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v8i32_neg33:
; X64-XOP: # %bb.0:
@@ -857,9 +777,17 @@ define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
define <16 x i16> @mul_v16i16_neg9(<16 x i16> %a0) nounwind {
; SSE-LABEL: mul_v16i16_neg9:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65527,65527,65527,65527,65527,65527,65527,65527]
-; SSE-NEXT: pmullw %xmm2, %xmm0
-; SSE-NEXT: pmullw %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psllw $3, %xmm3
+; SSE-NEXT: paddw %xmm0, %xmm3
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: psubw %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psllw $3, %xmm3
+; SSE-NEXT: paddw %xmm1, %xmm3
+; SSE-NEXT: psubw %xmm3, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v16i16_neg9:
@@ -877,12 +805,18 @@ define <16 x i16> @mul_v16i16_neg9(<16 x i16> %a0) nounwind {
;
; X64-AVX2-LABEL: mul_v16i16_neg9:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsllw $3, %ymm0, %ymm1
+; X64-AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512DQ-LABEL: mul_v16i16_neg9:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm1
+; X64-AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX512DQ-NEXT: vpsubw %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT: retq
%1 = mul <16 x i16> %a0, <i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9, i16 -9>
ret <16 x i16> %1
@@ -1162,35 +1096,18 @@ define <2 x i64> @mul_v2i64_7(<2 x i64> %a0) nounwind {
}
define <4 x i32> @mul_v4i32_7(<4 x i32> %a0) nounwind {
-; SSE2-LABEL: mul_v4i32_7:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslld $3, %xmm1
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: ret{{[l|q]}}
-;
-; X86-SSE4-LABEL: mul_v4i32_7:
-; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE4-NEXT: retl
-;
-; X64-SSE4-FAST-LABEL: mul_v4i32_7:
-; X64-SSE4-FAST: # %bb.0:
-; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE4-FAST-NEXT: retq
-;
-; X64-SSE4-SLOW-LABEL: mul_v4i32_7:
-; X64-SSE4-SLOW: # %bb.0:
-; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE4-SLOW-NEXT: pslld $3, %xmm1
-; X64-SSE4-SLOW-NEXT: psubd %xmm0, %xmm1
-; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE4-SLOW-NEXT: retq
+; SSE-LABEL: mul_v4i32_7:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pslld $3, %xmm1
+; SSE-NEXT: psubd %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i32_7:
; X64-XOP: # %bb.0:
-; X64-XOP-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: vpslld $3, %xmm0, %xmm1
+; X64-XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_7:
@@ -1208,19 +1125,18 @@ define <4 x i32> @mul_v4i32_7(<4 x i32> %a0) nounwind {
}
define <8 x i16> @mul_v8i16_7(<8 x i16> %a0) nounwind {
-; X86-SSE-LABEL: mul_v8i16_7:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT: retl
-;
-; X64-SSE-LABEL: mul_v8i16_7:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT: retq
+; SSE-LABEL: mul_v8i16_7:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psllw $3, %xmm1
+; SSE-NEXT: psubw %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_7:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpsllw $3, %xmm0, %xmm1
+; X64-AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: retq
%1 = mul <8 x i16> %a0, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
ret <8 x i16> %1
@@ -1290,33 +1206,17 @@ define <2 x i64> @mul_v2i64_neg7(<2 x i64> %a0) nounwind {
}
define <4 x i32> @mul_v4i32_neg63(<4 x i32> %a0) nounwind {
-; SSE2-LABEL: mul_v4i32_neg63:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslld $6, %xmm1
-; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: ret{{[l|q]}}
-;
-; X86-SSE4-LABEL: mul_v4i32_neg63:
-; X86-SSE4: # %bb.0:
-; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE4-NEXT: retl
-;
-; X64-SSE4-FAST-LABEL: mul_v4i32_neg63:
-; X64-SSE4-FAST: # %bb.0:
-; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE4-FAST-NEXT: retq
-;
-; X64-SSE4-SLOW-LABEL: mul_v4i32_neg63:
-; X64-SSE4-SLOW: # %bb.0:
-; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1
-; X64-SSE4-SLOW-NEXT: pslld $6, %xmm1
-; X64-SSE4-SLOW-NEXT: psubd %xmm1, %xmm0
-; X64-SSE4-SLOW-NEXT: retq
+; SSE-LABEL: mul_v4i32_neg63:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pslld $6, %xmm1
+; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
;
; X64-XOP-LABEL: mul_v4i32_neg63:
; X64-XOP: # %bb.0:
-; X64-XOP-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: vpslld $6, %xmm0, %xmm1
+; X64-XOP-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; X64-XOP-NEXT: retq
;
; X64-AVX2-LABEL: mul_v4i32_neg63:
@@ -1334,19 +1234,17 @@ define <4 x i32> @mul_v4i32_neg63(<4 x i32> %a0) nounwind {
}
define <8 x i16> @mul_v8i16_neg31(<8 x i16> %a0) nounwind {
-; X86-SSE-LABEL: mul_v8i16_neg31:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT: retl
-;
-; X64-SSE-LABEL: mul_v8i16_neg31:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT: retq
+; SSE-LABEL: mul_v8i16_neg31:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psllw $5, %xmm1
+; SSE-NEXT: psubw %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: mul_v8i16_neg31:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpsllw $5, %xmm0, %xmm1
+; X64-AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: retq
%1 = mul <8 x i16> %a0, <i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31>
ret <8 x i16> %1
@@ -2090,3 +1988,6 @@ foo:
%e = mul <4 x i64> %b, %d
ret <4 x i64> %e
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64-SSE4-FAST: {{.*}}
+; X64-SSE4-SLOW: {{.*}}