[llvm] bb42cc2 - [X86] decomposeMulByConstant - decompose legal vXi32 multiplies on SlowPMULLD targets and all vXi64 multiplies
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Oct 2 04:42:24 PDT 2021
Author: Simon Pilgrim
Date: 2021-10-02T12:35:25+01:00
New Revision: bb42cc2090a2316d529b92e1126241c53b85021a
URL: https://github.com/llvm/llvm-project/commit/bb42cc2090a2316d529b92e1126241c53b85021a
DIFF: https://github.com/llvm/llvm-project/commit/bb42cc2090a2316d529b92e1126241c53b85021a.diff
LOG: [X86] decomposeMulByConstant - decompose legal vXi32 multiplies on SlowPMULLD targets and all vXi64 multiplies
X86's decomposeMulByConstant never permits decomposing a multiply into shift+add/sub if the vector multiply is legal.
Unfortunately this isn't great for SSE41+ targets, which have PMULLD for vXi32 multiplies but often execute it quite slowly. This patch allows decomposition when the target has the SlowPMULLD flag (e.g. Silvermont). We also now always decompose legal vXi64 multiplies - even the latest IceLake has very poor latency for PMULLQ.
Differential Revision: https://reviews.llvm.org/D110588
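For reference, here is a minimal standalone C++ sketch (not part of the patch) of the scalar identities behind the shl+add, shl+sub and shl+add+neg decompositions that the updated test expectations below rely on; the vector forms in the test diff are the same identities applied lane-wise with pslld/psllq and padd/psub:

#include <cassert>
#include <cstdint>

int main() {
  // Non-negative test values keep the left shifts well-defined in standard C++.
  for (int64_t x : {0, 3, 12345}) {
    assert(x * 17 == (x << 4) + x);        // 17 = 16 + 1     -> shl + add
    assert(x * 7 == (x << 3) - x);         // 7 = 8 - 1       -> shl + sub
    assert(x * -1025 == -((x << 10) + x)); // 1025 = 1024 + 1 -> shl + add, then negate
    assert(x * -63 == x - (x << 6));       // 63 = 64 - 1     -> shl, subtract from x
  }
  return 0;
}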
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-mul.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7a496f9a439e1..59baeb792a966 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5533,10 +5533,13 @@ bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
VT = getTypeToTransformTo(Context, VT);
// If vector multiply is legal, assume that's faster than shl + add/sub.
- // TODO: Multiply is a complex op with higher latency and lower throughput in
- // most implementations, so this check could be loosened based on type
- // and/or a CPU attribute.
- if (isOperationLegal(ISD::MUL, VT))
+ // Multiply is a complex op with higher latency and lower throughput in
+ // most implementations; sub-vXi32 vector multiplies are always fast,
+ // vXi32 mustn't have a SlowPMULLD implementation, and anything larger (vXi64)
+ // is always going to be slow.
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
+ (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
return false;
// shl+add, shl+sub, shl+add+neg
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 7dd5b4c960b97..91400daa0919d 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X86-SSE,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4,X86-SSE,X86-SSE4
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,X64-SSE,X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,-slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,-slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4,X64-SSE4-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE,X64-SSE4,X64-SSE4-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=X64-AVX,X64-XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64-AVX,X64-AVX512DQ
@@ -264,22 +264,11 @@ define <2 x i64> @mul_v2i64_17(<2 x i64> %a0) nounwind {
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
-; X64-XOP-LABEL: mul_v2i64_17:
-; X64-XOP: # %bb.0:
-; X64-XOP-NEXT: vpsllq $4, %xmm0, %xmm1
-; X64-XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; X64-XOP-NEXT: retq
-;
-; X64-AVX2-LABEL: mul_v2i64_17:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpsllq $4, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: mul_v2i64_17:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX-LABEL: mul_v2i64_17:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsllq $4, %xmm0, %xmm1
+; X64-AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 17, i64 17>
ret <2 x i64> %1
}
@@ -298,10 +287,18 @@ define <4 x i32> @mul_v4i32_17(<4 x i32> %a0) nounwind {
; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: retl
;
-; X64-SSE4-LABEL: mul_v4i32_17:
-; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE4-NEXT: retq
+; X64-SSE4-FAST-LABEL: mul_v4i32_17:
+; X64-SSE4-FAST: # %bb.0:
+; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-FAST-NEXT: retq
+;
+; X64-SSE4-SLOW-LABEL: mul_v4i32_17:
+; X64-SSE4-SLOW: # %bb.0:
+; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE4-SLOW-NEXT: pslld $4, %xmm1
+; X64-SSE4-SLOW-NEXT: paddd %xmm0, %xmm1
+; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE4-SLOW-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_17:
; X64-XOP: # %bb.0:
@@ -414,7 +411,8 @@ define <4 x i64> @mul_v4i64_17(<4 x i64> %a0) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v4i64_17:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; X64-AVX512DQ-NEXT: vpsllq $4, %ymm0, %ymm1
+; X64-AVX512DQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT: retq
%1 = mul <4 x i64> %a0, <i64 17, i64 17, i64 17, i64 17>
ret <4 x i64> %1
@@ -433,12 +431,31 @@ define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: ret{{[l|q]}}
;
-; SSE4-LABEL: mul_v8i32_17:
-; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17]
-; SSE4-NEXT: pmulld %xmm2, %xmm0
-; SSE4-NEXT: pmulld %xmm2, %xmm1
-; SSE4-NEXT: ret{{[l|q]}}
+; X86-SSE4-LABEL: mul_v8i32_17:
+; X86-SSE4: # %bb.0:
+; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17]
+; X86-SSE4-NEXT: pmulld %xmm2, %xmm0
+; X86-SSE4-NEXT: pmulld %xmm2, %xmm1
+; X86-SSE4-NEXT: retl
+;
+; X64-SSE4-FAST-LABEL: mul_v8i32_17:
+; X64-SSE4-FAST: # %bb.0:
+; X64-SSE4-FAST-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17]
+; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm0
+; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm1
+; X64-SSE4-FAST-NEXT: retq
+;
+; X64-SSE4-SLOW-LABEL: mul_v8i32_17:
+; X64-SSE4-SLOW: # %bb.0:
+; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE4-SLOW-NEXT: pslld $4, %xmm2
+; X64-SSE4-SLOW-NEXT: paddd %xmm0, %xmm2
+; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE4-SLOW-NEXT: pslld $4, %xmm3
+; X64-SSE4-SLOW-NEXT: paddd %xmm1, %xmm3
+; X64-SSE4-SLOW-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE4-SLOW-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE4-SLOW-NEXT: retq
;
; X64-XOP-LABEL: mul_v8i32_17:
; X64-XOP: # %bb.0:
@@ -553,26 +570,13 @@ define <2 x i64> @mul_v2i64_neg1025(<2 x i64> %a0) nounwind {
; SSE-NEXT: psubq %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
-; X64-XOP-LABEL: mul_v2i64_neg1025:
-; X64-XOP: # %bb.0:
-; X64-XOP-NEXT: vpsllq $10, %xmm0, %xmm1
-; X64-XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; X64-XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; X64-XOP-NEXT: retq
-;
-; X64-AVX2-LABEL: mul_v2i64_neg1025:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpsllq $10, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: mul_v2i64_neg1025:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX-LABEL: mul_v2i64_neg1025:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsllq $10, %xmm0, %xmm1
+; X64-AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 -1025, i64 -1025>
ret <2 x i64> %1
}
@@ -592,10 +596,19 @@ define <4 x i32> @mul_v4i32_neg33(<4 x i32> %a0) nounwind {
; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: retl
;
-; X64-SSE4-LABEL: mul_v4i32_neg33:
-; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE4-NEXT: retq
+; X64-SSE4-FAST-LABEL: mul_v4i32_neg33:
+; X64-SSE4-FAST: # %bb.0:
+; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-FAST-NEXT: retq
+;
+; X64-SSE4-SLOW-LABEL: mul_v4i32_neg33:
+; X64-SSE4-SLOW: # %bb.0:
+; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE4-SLOW-NEXT: pslld $5, %xmm1
+; X64-SSE4-SLOW-NEXT: paddd %xmm0, %xmm1
+; X64-SSE4-SLOW-NEXT: pxor %xmm0, %xmm0
+; X64-SSE4-SLOW-NEXT: psubd %xmm1, %xmm0
+; X64-SSE4-SLOW-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_neg33:
; X64-XOP: # %bb.0:
@@ -724,7 +737,10 @@ define <4 x i64> @mul_v4i64_neg1025(<4 x i64> %a0) nounwind {
;
; X64-AVX512DQ-LABEL: mul_v4i64_neg1025:
; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; X64-AVX512DQ-NEXT: vpsllq $10, %ymm0, %ymm1
+; X64-AVX512DQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; X64-AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX512DQ-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; X64-AVX512DQ-NEXT: retq
%1 = mul <4 x i64> %a0, <i64 -1025, i64 -1025, i64 -1025, i64 -1025>
ret <4 x i64> %1
@@ -746,12 +762,34 @@ define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: ret{{[l|q]}}
;
-; SSE4-LABEL: mul_v8i32_neg33:
-; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
-; SSE4-NEXT: pmulld %xmm2, %xmm0
-; SSE4-NEXT: pmulld %xmm2, %xmm1
-; SSE4-NEXT: ret{{[l|q]}}
+; X86-SSE4-LABEL: mul_v8i32_neg33:
+; X86-SSE4: # %bb.0:
+; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
+; X86-SSE4-NEXT: pmulld %xmm2, %xmm0
+; X86-SSE4-NEXT: pmulld %xmm2, %xmm1
+; X86-SSE4-NEXT: retl
+;
+; X64-SSE4-FAST-LABEL: mul_v8i32_neg33:
+; X64-SSE4-FAST: # %bb.0:
+; X64-SSE4-FAST-NEXT: movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
+; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm0
+; X64-SSE4-FAST-NEXT: pmulld %xmm2, %xmm1
+; X64-SSE4-FAST-NEXT: retq
+;
+; X64-SSE4-SLOW-LABEL: mul_v8i32_neg33:
+; X64-SSE4-SLOW: # %bb.0:
+; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE4-SLOW-NEXT: pslld $5, %xmm3
+; X64-SSE4-SLOW-NEXT: paddd %xmm0, %xmm3
+; X64-SSE4-SLOW-NEXT: pxor %xmm2, %xmm2
+; X64-SSE4-SLOW-NEXT: pxor %xmm0, %xmm0
+; X64-SSE4-SLOW-NEXT: psubd %xmm3, %xmm0
+; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE4-SLOW-NEXT: pslld $5, %xmm3
+; X64-SSE4-SLOW-NEXT: paddd %xmm1, %xmm3
+; X64-SSE4-SLOW-NEXT: psubd %xmm3, %xmm2
+; X64-SSE4-SLOW-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE4-SLOW-NEXT: retq
;
; X64-XOP-LABEL: mul_v8i32_neg33:
; X64-XOP: # %bb.0:
@@ -1070,22 +1108,11 @@ define <2 x i64> @mul_v2i64_7(<2 x i64> %a0) nounwind {
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
-; X64-XOP-LABEL: mul_v2i64_7:
-; X64-XOP: # %bb.0:
-; X64-XOP-NEXT: vpsllq $3, %xmm0, %xmm1
-; X64-XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; X64-XOP-NEXT: retq
-;
-; X64-AVX2-LABEL: mul_v2i64_7:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpsllq $3, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: mul_v2i64_7:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX-LABEL: mul_v2i64_7:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsllq $3, %xmm0, %xmm1
+; X64-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 7, i64 7>
ret <2 x i64> %1
}
@@ -1104,10 +1131,18 @@ define <4 x i32> @mul_v4i32_7(<4 x i32> %a0) nounwind {
; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: retl
;
-; X64-SSE4-LABEL: mul_v4i32_7:
-; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE4-NEXT: retq
+; X64-SSE4-FAST-LABEL: mul_v4i32_7:
+; X64-SSE4-FAST: # %bb.0:
+; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-FAST-NEXT: retq
+;
+; X64-SSE4-SLOW-LABEL: mul_v4i32_7:
+; X64-SSE4-SLOW: # %bb.0:
+; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE4-SLOW-NEXT: pslld $3, %xmm1
+; X64-SSE4-SLOW-NEXT: psubd %xmm0, %xmm1
+; X64-SSE4-SLOW-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE4-SLOW-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_7:
; X64-XOP: # %bb.0:
@@ -1201,22 +1236,11 @@ define <2 x i64> @mul_v2i64_neg7(<2 x i64> %a0) nounwind {
; SSE-NEXT: psubq %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
-; X64-XOP-LABEL: mul_v2i64_neg7:
-; X64-XOP: # %bb.0:
-; X64-XOP-NEXT: vpsllq $3, %xmm0, %xmm1
-; X64-XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; X64-XOP-NEXT: retq
-;
-; X64-AVX2-LABEL: mul_v2i64_neg7:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpsllq $3, %xmm0, %xmm1
-; X64-AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: mul_v2i64_neg7:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX-LABEL: mul_v2i64_neg7:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsllq $3, %xmm0, %xmm1
+; X64-AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
%1 = mul <2 x i64> %a0, <i64 -7, i64 -7>
ret <2 x i64> %1
}
@@ -1234,10 +1258,17 @@ define <4 x i32> @mul_v4i32_neg63(<4 x i32> %a0) nounwind {
; X86-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: retl
;
-; X64-SSE4-LABEL: mul_v4i32_neg63:
-; X64-SSE4: # %bb.0:
-; X64-SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE4-NEXT: retq
+; X64-SSE4-FAST-LABEL: mul_v4i32_neg63:
+; X64-SSE4-FAST: # %bb.0:
+; X64-SSE4-FAST-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE4-FAST-NEXT: retq
+;
+; X64-SSE4-SLOW-LABEL: mul_v4i32_neg63:
+; X64-SSE4-SLOW: # %bb.0:
+; X64-SSE4-SLOW-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE4-SLOW-NEXT: pslld $6, %xmm1
+; X64-SSE4-SLOW-NEXT: psubd %xmm1, %xmm0
+; X64-SSE4-SLOW-NEXT: retq
;
; X64-XOP-LABEL: mul_v4i32_neg63:
; X64-XOP: # %bb.0: