[llvm] r344361 - [x86] add and use fast horizontal vector math subtarget feature
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 12 09:41:02 PDT 2018
Author: spatel
Date: Fri Oct 12 09:41:02 2018
New Revision: 344361
URL: http://llvm.org/viewvc/llvm-project?rev=344361&view=rev
Log:
[x86] add and use fast horizontal vector math subtarget feature
This is the planned follow-up to D52997. Here we reduce horizontal vector math codegen
by default. AMD Jaguar (btver2) should see no difference with this patch because it is
marked with the new fast-hops feature. (If we want to set that bit for other CPUs, let
me know.)
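In the meantime, the behavior can be toggled explicitly for any CPU via the attribute
string. A minimal sketch (foo.ll is a placeholder input file; the test RUN lines below
use the same -mattr syntax):

  llc -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops foo.ll -o -
  llc -mtriple=x86_64-unknown -mcpu=btver2 -mattr=-fast-hops foo.ll -o -

The first command opts a generic SSSE3 target in to hadd/hsub formation; the second
shows that a CPU which now gets the bit by default (btver2) can opt back out again.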
The code changes are small, but there are many test diffs. For files that specifically
test for hops, I added RUN lines to distinguish the fast and slow cases, so we can see
the consequences side-by-side. For files that are primarily concerned with codegen other
than hops, I just updated the CHECK lines to reflect the new default codegen.
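As a concrete example of the new side-by-side scheme, here is a reduced form of one of
the updated tests (it mirrors phaddd3 in avx2-phaddsub.ll below; the SLOW/FAST prefixes
are simplified stand-ins for the X64-SLOW/X64-FAST prefixes used in the real file):

; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefix=SLOW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,fast-hops | FileCheck %s --check-prefix=FAST

define <8 x i32> @phaddd3(<8 x i32> %x) {
; SLOW-LABEL: phaddd3:
; SLOW:       vpshufd
; SLOW:       vpaddd %ymm0, %ymm1, %ymm0
; FAST-LABEL: phaddd3:
; FAST:       vphaddd %ymm0, %ymm0, %ymm0
  %a = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = add <8 x i32> %a, %b
  ret <8 x i32> %r
}

Without fast-hops the combiner now keeps the shuffle+add sequence; with fast-hops (or on
btver2) it still forms vphaddd as before.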
To recap the recent horizontal op story:
1. Before rL343727, we were producing hops for all subtargets for a variety of patterns.
Hops were likely not optimal for all targets though.
2. The IR improvement in r343727 exposed a hole in the backend hop pattern matching, so
we reduced hop codegen for all subtargets. That was bad for Jaguar (PR39195).
3. We restored the hop codegen for all targets with rL344141. Good for Jaguar, but
probably bad for other CPUs.
4. This patch allows us to distinguish when we want to produce hops, so everyone can be
happy. I'm not sure if we have the best predicate here, but the intent is to undo the
extra hop-iness that was enabled by r344141.
Differential Revision: https://reviews.llvm.org/D53095
Modified:
llvm/trunk/lib/Target/X86/X86.td
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86Subtarget.h
llvm/trunk/test/CodeGen/X86/avx2-phaddsub.ll
llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll
llvm/trunk/test/CodeGen/X86/haddsub-undef.ll
llvm/trunk/test/CodeGen/X86/haddsub.ll
llvm/trunk/test/CodeGen/X86/madd.ll
llvm/trunk/test/CodeGen/X86/phaddsub.ll
llvm/trunk/test/CodeGen/X86/required-vector-width.ll
llvm/trunk/test/CodeGen/X86/sad.ll
llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll
llvm/trunk/test/CodeGen/X86/vector-reduce-fadd-fast.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
Modified: llvm/trunk/lib/Target/X86/X86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86.td?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86.td (original)
+++ llvm/trunk/lib/Target/X86/X86.td Fri Oct 12 09:41:02 2018
@@ -404,6 +404,15 @@ def FeatureFastBEXTR : SubtargetFeature<
"Indicates that the BEXTR instruction is implemented as a single uop "
"with good throughput.">;
+// Combine vector math operations with shuffles into horizontal math
+// instructions if a CPU implements horizontal operations (introduced with
+// SSE3) with better latency/throughput than the alternative sequence.
+def FeatureFastHorizontalOps
+ : SubtargetFeature<
+ "fast-hops", "HasFastHorizontalOps", "true",
+ "Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
+ "normal vector instructions with shuffles", [FeatureSSE3]>;
+
// Merge branches using three-way conditional code.
def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
"ThreewayBranchProfitable", "true",
@@ -998,7 +1007,8 @@ def : ProcessorModel<"btver2", BtVer2Mod
FeatureLAHFSAHF,
FeatureFast15ByteNOP,
FeatureFastBEXTR,
- FeatureFastPartialYMMorZMMWrite
+ FeatureFastPartialYMMorZMMWrite,
+ FeatureFastHorizontalOps
]>;
// Bulldozer
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Oct 12 09:41:02 2018
@@ -37031,9 +37031,6 @@ static bool isHorizontalBinOp(SDValue &L
// The low half of the 128-bit result must choose from A.
// The high half of the 128-bit result must choose from B,
// unless B is undef. In that case, we are always choosing from A.
- // TODO: Using a horizontal op on a single input is likely worse for
- // performance on many CPUs, so this should be limited here or reversed
- // in a later pass.
unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
@@ -37051,6 +37048,16 @@ static bool isHorizontalBinOp(SDValue &L
return true;
}
+/// Horizontal vector math instructions may be slower than normal math with
+/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
+/// implementation, and likely shuffle complexity of the alternate sequence.
+static bool shouldCombineToHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ bool IsOptimizingSize = DAG.getMachineFunction().getFunction().optForSize();
+ bool HasFastHOps = Subtarget.hasFastHorizontalOps();
+ return !IsSingleSource || IsOptimizingSize || HasFastHOps;
+}
+
/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -37063,7 +37070,8 @@ static SDValue combineFaddFsub(SDNode *N
// Try to synthesize horizontal add/sub from adds/subs of shuffles.
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
- isHorizontalBinOp(LHS, RHS, IsFadd)) {
+ isHorizontalBinOp(LHS, RHS, IsFadd) &&
+ shouldCombineToHorizontalOp(LHS == RHS, DAG, Subtarget)) {
auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
}
@@ -39787,7 +39795,8 @@ static SDValue combineAdd(SDNode *N, Sel
// Try to synthesize horizontal adds from adds of shuffles.
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
- Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true)) {
+ Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) &&
+ shouldCombineToHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
@@ -39918,7 +39927,8 @@ static SDValue combineSub(SDNode *N, Sel
EVT VT = N->getValueType(0);
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
- Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false)) {
+ Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) &&
+ shouldCombineToHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
Modified: llvm/trunk/lib/Target/X86/X86Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.h?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h Fri Oct 12 09:41:02 2018
@@ -388,6 +388,9 @@ protected:
/// Processor has a single uop BEXTR implementation.
bool HasFastBEXTR = false;
+ /// Try harder to combine to horizontal vector ops if they are fast.
+ bool HasFastHorizontalOps = false;
+
/// Use a retpoline thunk rather than indirect calls to block speculative
/// execution.
bool UseRetpolineIndirectCalls = false;
@@ -636,6 +639,7 @@ public:
bool hasFastLZCNT() const { return HasFastLZCNT; }
bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
bool hasFastBEXTR() const { return HasFastBEXTR; }
+ bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
bool hasMacroFusion() const { return HasMacroFusion; }
bool hasERMSB() const { return HasERMSB; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
Modified: llvm/trunk/test/CodeGen/X86/avx2-phaddsub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-phaddsub.ll?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-phaddsub.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-phaddsub.ll Fri Oct 12 09:41:02 2018
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-- -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW
+; RUN: llc < %s -mtriple=i686-- -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=X32,X32-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=X64,X64-FAST
define <16 x i16> @phaddw1(<16 x i16> %x, <16 x i16> %y) {
; X32-LABEL: phaddw1:
@@ -67,15 +69,29 @@ define <8 x i32> @phaddd2(<8 x i32> %x,
}
define <8 x i32> @phaddd3(<8 x i32> %x) {
-; X32-LABEL: phaddd3:
-; X32: # %bb.0:
-; X32-NEXT: vphaddd %ymm0, %ymm0, %ymm0
-; X32-NEXT: retl
+; X32-SLOW-LABEL: phaddd3:
+; X32-SLOW: # %bb.0:
+; X32-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; X32-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; X32-SLOW-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; X32-SLOW-NEXT: retl
;
-; X64-LABEL: phaddd3:
-; X64: # %bb.0:
-; X64-NEXT: vphaddd %ymm0, %ymm0, %ymm0
-; X64-NEXT: retq
+; X32-FAST-LABEL: phaddd3:
+; X32-FAST: # %bb.0:
+; X32-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; X32-FAST-NEXT: retl
+;
+; X64-SLOW-LABEL: phaddd3:
+; X64-SLOW: # %bb.0:
+; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; X64-SLOW-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; X64-SLOW-NEXT: retq
+;
+; X64-FAST-LABEL: phaddd3:
+; X64-FAST: # %bb.0:
+; X64-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; X64-FAST-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
%b = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
%r = add <8 x i32> %a, %b
Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll Fri Oct 12 09:41:02 2018
@@ -6860,7 +6860,8 @@ define i32 @test_mm512_reduce_add_epi32(
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
@@ -6989,7 +6990,8 @@ define i32 @test_mm512_mask_reduce_add_e
; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; X86-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
@@ -7004,7 +7006,8 @@ define i32 @test_mm512_mask_reduce_add_e
; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; X64-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -7210,7 +7213,8 @@ define double @test_mm512_reduce_add_pd(
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -7225,7 +7229,8 @@ define double @test_mm512_reduce_add_pd(
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
@@ -7405,7 +7410,8 @@ define double @test_mm512_mask_reduce_ad
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -7422,7 +7428,8 @@ define double @test_mm512_mask_reduce_ad
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
Modified: llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll (original)
+++ llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll Fri Oct 12 09:41:02 2018
@@ -1,21 +1,54 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3_SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3_FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST
; The next 8 tests check for matching the horizontal op and eliminating the shuffle.
; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111
define <4 x float> @hadd_v4f32(<4 x float> %a) {
-; SSSE3-LABEL: hadd_v4f32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: haddps %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: hadd_v4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3_SLOW-LABEL: hadd_v4f32:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm1
+; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT: addps %xmm1, %xmm0
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hadd_v4f32:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: haddps %xmm0, %xmm0
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hadd_v4f32:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hadd_v4f32:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hadd_v4f32:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hadd_v4f32:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT: retq
%a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
%a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
%hop = fadd <2 x float> %a02, %a13
@@ -54,16 +87,51 @@ define <8 x float> @hadd_v8f32a(<8 x flo
}
define <8 x float> @hadd_v8f32b(<8 x float> %a) {
-; SSSE3-LABEL: hadd_v8f32b:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: haddps %xmm0, %xmm0
-; SSSE3-NEXT: haddps %xmm1, %xmm1
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: hadd_v8f32b:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
-; AVX-NEXT: retq
+; SSSE3_SLOW-LABEL: hadd_v8f32b:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,3]
+; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm3
+; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
+; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT: addps %xmm2, %xmm0
+; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3_SLOW-NEXT: addps %xmm3, %xmm1
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hadd_v8f32b:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: haddps %xmm0, %xmm0
+; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hadd_v8f32b:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX1_SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hadd_v8f32b:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hadd_v8f32b:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2_SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hadd_v8f32b:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT: retq
%a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
%a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
%hop = fadd <8 x float> %a0, %a1
@@ -72,15 +140,45 @@ define <8 x float> @hadd_v8f32b(<8 x flo
}
define <4 x float> @hsub_v4f32(<4 x float> %a) {
-; SSSE3-LABEL: hsub_v4f32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: hsubps %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: hsub_v4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3_SLOW-LABEL: hsub_v4f32:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm1
+; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT: subps %xmm0, %xmm1
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hsub_v4f32:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hsub_v4f32:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1_SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hsub_v4f32:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hsub_v4f32:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2_SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hsub_v4f32:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT: retq
%a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
%a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
%hop = fsub <2 x float> %a02, %a13
@@ -119,16 +217,51 @@ define <8 x float> @hsub_v8f32a(<8 x flo
}
define <8 x float> @hsub_v8f32b(<8 x float> %a) {
-; SSSE3-LABEL: hsub_v8f32b:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: hsubps %xmm0, %xmm0
-; SSSE3-NEXT: hsubps %xmm1, %xmm1
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: hsub_v8f32b:
-; AVX: # %bb.0:
-; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
-; AVX-NEXT: retq
+; SSSE3_SLOW-LABEL: hsub_v8f32b:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,3]
+; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm3
+; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
+; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT: subps %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3_SLOW-NEXT: subps %xmm1, %xmm3
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hsub_v8f32b:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0
+; SSSE3_FAST-NEXT: hsubps %xmm1, %xmm1
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hsub_v8f32b:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX1_SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0
+; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hsub_v8f32b:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hsub_v8f32b:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2_SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hsub_v8f32b:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT: retq
%a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
%a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
%hop = fsub <8 x float> %a0, %a1
@@ -137,15 +270,42 @@ define <8 x float> @hsub_v8f32b(<8 x flo
}
define <2 x double> @hadd_v2f64(<2 x double> %a) {
-; SSSE3-LABEL: hadd_v2f64:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: haddpd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: hadd_v2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3_SLOW-LABEL: hadd_v2f64:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1
+; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSSE3_SLOW-NEXT: addpd %xmm0, %xmm1
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hadd_v2f64:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hadd_v2f64:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1_SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hadd_v2f64:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hadd_v2f64:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2_SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hadd_v2f64:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT: retq
%a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
%a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
%hop = fadd <2 x double> %a0, %a1
@@ -154,16 +314,47 @@ define <2 x double> @hadd_v2f64(<2 x dou
}
define <4 x double> @hadd_v4f64(<4 x double> %a) {
-; SSSE3-LABEL: hadd_v4f64:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: haddpd %xmm0, %xmm0
-; SSSE3-NEXT: haddpd %xmm1, %xmm1
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: hadd_v4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT: retq
+; SSSE3_SLOW-LABEL: hadd_v4f64:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3
+; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSSE3_SLOW-NEXT: addpd %xmm1, %xmm3
+; SSSE3_SLOW-NEXT: addpd %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hadd_v4f64:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0
+; SSSE3_FAST-NEXT: haddpd %xmm1, %xmm1
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hadd_v4f64:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; AVX1_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hadd_v4f64:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hadd_v4f64:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; AVX2_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hadd_v4f64:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT: retq
%a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
%a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
%hop = fadd <4 x double> %a0, %a1
@@ -172,15 +363,42 @@ define <4 x double> @hadd_v4f64(<4 x dou
}
define <2 x double> @hsub_v2f64(<2 x double> %a) {
-; SSSE3-LABEL: hsub_v2f64:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: hsubpd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: hsub_v2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3_SLOW-LABEL: hsub_v2f64:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1
+; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSSE3_SLOW-NEXT: subpd %xmm1, %xmm0
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hsub_v2f64:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: hsubpd %xmm0, %xmm0
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hsub_v2f64:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1_SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hsub_v2f64:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hsub_v2f64:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2_SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hsub_v2f64:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT: retq
%a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
%a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
%hop = fsub <2 x double> %a0, %a1
@@ -189,16 +407,47 @@ define <2 x double> @hsub_v2f64(<2 x dou
}
define <4 x double> @hsub_v4f64(<4 x double> %a) {
-; SSSE3-LABEL: hsub_v4f64:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: hsubpd %xmm0, %xmm0
-; SSSE3-NEXT: hsubpd %xmm1, %xmm1
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: hsub_v4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vhsubpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT: retq
+; SSSE3_SLOW-LABEL: hsub_v4f64:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3
+; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSSE3_SLOW-NEXT: subpd %xmm3, %xmm1
+; SSSE3_SLOW-NEXT: subpd %xmm2, %xmm0
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hsub_v4f64:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: hsubpd %xmm0, %xmm0
+; SSSE3_FAST-NEXT: hsubpd %xmm1, %xmm1
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hsub_v4f64:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; AVX1_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0
+; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hsub_v4f64:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hsub_v4f64:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; AVX2_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0
+; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hsub_v4f64:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT: retq
%a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
%a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
%hop = fsub <4 x double> %a0, %a1
@@ -207,15 +456,44 @@ define <4 x double> @hsub_v4f64(<4 x dou
}
define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
-; SSSE3-LABEL: hadd_v4i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: hadd_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3_SLOW-LABEL: hadd_v4i32:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hadd_v4i32:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hadd_v4i32:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hadd_v4i32:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hadd_v4i32:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hadd_v4i32:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT: retq
%a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%hop = add <4 x i32> %a02, %a13
@@ -254,25 +532,57 @@ define <8 x i32> @hadd_v8i32a(<8 x i32>
}
define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
-; SSSE3-LABEL: hadd_v8i32b:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: phaddd %xmm1, %xmm1
-; SSSE3-NEXT: retq
-;
-; AVX1-LABEL: hadd_v8i32b:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: hadd_v8i32b:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; SSSE3_SLOW-LABEL: hadd_v8i32b:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT: paddd %xmm2, %xmm0
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3_SLOW-NEXT: paddd %xmm3, %xmm1
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hadd_v8i32b:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3_FAST-NEXT: phaddd %xmm1, %xmm1
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hadd_v8i32b:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1_SLOW-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1_SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hadd_v8i32b:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm1
+; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hadd_v8i32b:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2_SLOW-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hadd_v8i32b:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT: retq
%a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
%a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
%hop = add <8 x i32> %a0, %a1
@@ -281,15 +591,44 @@ define <8 x i32> @hadd_v8i32b(<8 x i32>
}
define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
-; SSSE3-LABEL: hsub_v4i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phsubd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: hsub_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3_SLOW-LABEL: hsub_v4i32:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm1
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hsub_v4i32:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: phsubd %xmm0, %xmm0
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hsub_v4i32:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hsub_v4i32:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hsub_v4i32:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hsub_v4i32:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT: retq
%a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%hop = sub <4 x i32> %a02, %a13
@@ -328,25 +667,57 @@ define <8 x i32> @hsub_v8i32a(<8 x i32>
}
define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
-; SSSE3-LABEL: hsub_v8i32b:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phsubd %xmm0, %xmm0
-; SSSE3-NEXT: phsubd %xmm1, %xmm1
-; SSSE3-NEXT: retq
-;
-; AVX1-LABEL: hsub_v8i32b:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: hsub_v8i32b:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vphsubd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; SSSE3_SLOW-LABEL: hsub_v8i32b:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm2
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm3
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hsub_v8i32b:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: phsubd %xmm0, %xmm0
+; SSSE3_FAST-NEXT: phsubd %xmm1, %xmm1
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hsub_v8i32b:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1_SLOW-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1_SLOW-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; AVX1_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hsub_v8i32b:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm1
+; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hsub_v8i32b:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2_SLOW-NEXT: vpsubd %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hsub_v8i32b:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vphsubd %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT: retq
%a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
%a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
%hop = sub <8 x i32> %a0, %a1
@@ -355,15 +726,45 @@ define <8 x i32> @hsub_v8i32b(<8 x i32>
}
define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
-; SSSE3-LABEL: hadd_v8i16:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: hadd_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3_SLOW-LABEL: hadd_v8i16:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm1
+; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3_SLOW-NEXT: paddw %xmm1, %xmm0
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hadd_v8i16:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hadd_v8i16:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1_SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hadd_v8i16:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hadd_v8i16:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX2_SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hadd_v8i16:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT: retq
%a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
%a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%hop = add <8 x i16> %a0246, %a1357
@@ -402,25 +803,64 @@ define <16 x i16> @hadd_v16i16a(<16 x i1
}
define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
-; SSSE3-LABEL: hadd_v16i16b:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-NEXT: phaddw %xmm1, %xmm1
-; SSSE3-NEXT: retq
-;
-; AVX1-LABEL: hadd_v16i16b:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: hadd_v16i16b:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; SSSE3_SLOW-LABEL: hadd_v16i16b:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm3
+; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm3
+; SSSE3_SLOW-NEXT: movdqa %xmm1, %xmm4
+; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm4
+; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm0
+; SSSE3_SLOW-NEXT: paddw %xmm3, %xmm0
+; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm1
+; SSSE3_SLOW-NEXT: paddw %xmm4, %xmm1
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hadd_v16i16b:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3_FAST-NEXT: phaddw %xmm1, %xmm1
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hadd_v16i16b:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2
+; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1_SLOW-NEXT: vpaddw %xmm0, %xmm2, %xmm0
+; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2
+; AVX1_SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hadd_v16i16b:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm1
+; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hadd_v16i16b:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
+; AVX2_SLOW-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hadd_v16i16b:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vphaddw %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT: retq
%a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
%a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
%hop = add <16 x i16> %a0, %a1
@@ -429,15 +869,45 @@ define <16 x i16> @hadd_v16i16b(<16 x i1
}
define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
-; SSSE3-LABEL: hsub_v8i16:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phsubw %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: hsub_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vphsubw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3_SLOW-LABEL: hsub_v8i16:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm1
+; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3_SLOW-NEXT: psubw %xmm0, %xmm1
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hsub_v8i16:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: phsubw %xmm0, %xmm0
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hsub_v8i16:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1_SLOW-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hsub_v8i16:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hsub_v8i16:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX2_SLOW-NEXT: vpsubw %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hsub_v8i16:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT: retq
%a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
%a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%hop = sub <8 x i16> %a0246, %a1357
@@ -476,25 +946,64 @@ define <16 x i16> @hsub_v16i16a(<16 x i1
}
define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
-; SSSE3-LABEL: hsub_v16i16b:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phsubw %xmm0, %xmm0
-; SSSE3-NEXT: phsubw %xmm1, %xmm1
-; SSSE3-NEXT: retq
-;
-; AVX1-LABEL: hsub_v16i16b:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: hsub_v16i16b:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vphsubw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; SSSE3_SLOW-LABEL: hsub_v16i16b:
+; SSSE3_SLOW: # %bb.0:
+; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm3
+; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm3
+; SSSE3_SLOW-NEXT: movdqa %xmm1, %xmm4
+; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm4
+; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm0
+; SSSE3_SLOW-NEXT: psubw %xmm0, %xmm3
+; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm1
+; SSSE3_SLOW-NEXT: psubw %xmm1, %xmm4
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1]
+; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
+; SSSE3_SLOW-NEXT: retq
+;
+; SSSE3_FAST-LABEL: hsub_v16i16b:
+; SSSE3_FAST: # %bb.0:
+; SSSE3_FAST-NEXT: phsubw %xmm0, %xmm0
+; SSSE3_FAST-NEXT: phsubw %xmm1, %xmm1
+; SSSE3_FAST-NEXT: retq
+;
+; AVX1_SLOW-LABEL: hsub_v16i16b:
+; AVX1_SLOW: # %bb.0:
+; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2
+; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1_SLOW-NEXT: vpsubw %xmm0, %xmm2, %xmm0
+; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2
+; AVX1_SLOW-NEXT: vpsubw %xmm2, %xmm1, %xmm1
+; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT: retq
+;
+; AVX1_FAST-LABEL: hsub_v16i16b:
+; AVX1_FAST: # %bb.0:
+; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm1
+; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_FAST-NEXT: retq
+;
+; AVX2_SLOW-LABEL: hsub_v16i16b:
+; AVX2_SLOW: # %bb.0:
+; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
+; AVX2_SLOW-NEXT: vpsubw %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2_SLOW-NEXT: retq
+;
+; AVX2_FAST-LABEL: hsub_v16i16b:
+; AVX2_FAST: # %bb.0:
+; AVX2_FAST-NEXT: vphsubw %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT: retq
%a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
%a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
%hop = sub <16 x i16> %a0, %a1
Modified: llvm/trunk/test/CodeGen/X86/haddsub-undef.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/haddsub-undef.ll?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/haddsub-undef.ll (original)
+++ llvm/trunk/test/CodeGen/X86/haddsub-undef.ll Fri Oct 12 09:41:02 2018
@@ -1,7 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.
@@ -339,8 +342,6 @@ define <8 x i32> @test14_undef(<8 x i32>
ret <8 x i32> %vecinit5
}
-; On AVX2, the following sequence can be folded into a single horizontal add.
-; If the Subtarget doesn't support AVX2, then we avoid emitting two packed
; integer horizontal adds instead of two scalar adds followed by vector inserts.
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test15_undef:
@@ -451,15 +452,38 @@ define <8 x i32> @test17_undef(<8 x i32>
}
define <2 x double> @add_pd_003(<2 x double> %x) {
-; SSE-LABEL: add_pd_003:
-; SSE: # %bb.0:
-; SSE-NEXT: haddpd %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: add_pd_003:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE-SLOW-LABEL: add_pd_003:
+; SSE-SLOW: # %bb.0:
+; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
+; SSE-SLOW-NEXT: addpd %xmm1, %xmm0
+; SSE-SLOW-NEXT: retq
+;
+; SSE-FAST-LABEL: add_pd_003:
+; SSE-FAST: # %bb.0:
+; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
+; SSE-FAST-NEXT: retq
+;
+; AVX1-SLOW-LABEL: add_pd_003:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: add_pd_003:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-SLOW-LABEL: add_pd_003:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: add_pd_003:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: retq
%l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
%add = fadd <2 x double> %l, %x
ret <2 x double> %add
@@ -468,31 +492,84 @@ define <2 x double> @add_pd_003(<2 x dou
; Change shuffle mask - no undefs.
define <2 x double> @add_pd_003_2(<2 x double> %x) {
-; SSE-LABEL: add_pd_003_2:
-; SSE: # %bb.0:
-; SSE-NEXT: haddpd %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: add_pd_003_2:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE-SLOW-LABEL: add_pd_003_2:
+; SSE-SLOW: # %bb.0:
+; SSE-SLOW-NEXT: movapd %xmm0, %xmm1
+; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE-SLOW-NEXT: addpd %xmm0, %xmm1
+; SSE-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSE-SLOW-NEXT: retq
+;
+; SSE-FAST-LABEL: add_pd_003_2:
+; SSE-FAST: # %bb.0:
+; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
+; SSE-FAST-NEXT: retq
+;
+; AVX1-SLOW-LABEL: add_pd_003_2:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: add_pd_003_2:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-SLOW-LABEL: add_pd_003_2:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: add_pd_003_2:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: retq
%l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%add = fadd <2 x double> %l, %x
ret <2 x double> %add
}
define <2 x double> @add_pd_010(<2 x double> %x) {
-; SSE-LABEL: add_pd_010:
-; SSE: # %bb.0:
-; SSE-NEXT: haddpd %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: add_pd_010:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: retq
+; SSE-SLOW-LABEL: add_pd_010:
+; SSE-SLOW: # %bb.0:
+; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
+; SSE-SLOW-NEXT: addpd %xmm0, %xmm1
+; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSE-SLOW-NEXT: retq
+;
+; SSE-FAST-LABEL: add_pd_010:
+; SSE-FAST: # %bb.0:
+; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
+; SSE-FAST-NEXT: retq
+;
+; AVX1-SLOW-LABEL: add_pd_010:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: add_pd_010:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-SLOW-LABEL: add_pd_010:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: add_pd_010:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-FAST-NEXT: retq
%l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
%add = fadd <2 x double> %l, %x
%shuffle2 = shufflevector <2 x double> %add, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -500,15 +577,42 @@ define <2 x double> @add_pd_010(<2 x dou
}
define <4 x float> @add_ps_007(<4 x float> %x) {
-; SSE-LABEL: add_ps_007:
-; SSE: # %bb.0:
-; SSE-NEXT: haddps %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: add_ps_007:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE-SLOW-LABEL: add_ps_007:
+; SSE-SLOW: # %bb.0:
+; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE-SLOW-NEXT: retq
+;
+; SSE-FAST-LABEL: add_ps_007:
+; SSE-FAST: # %bb.0:
+; SSE-FAST-NEXT: haddps %xmm0, %xmm0
+; SSE-FAST-NEXT: retq
+;
+; AVX1-SLOW-LABEL: add_ps_007:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: add_ps_007:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-SLOW-LABEL: add_ps_007:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: add_ps_007:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: retq
%l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
%r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
%add = fadd <4 x float> %l, %r
@@ -516,17 +620,48 @@ define <4 x float> @add_ps_007(<4 x floa
}
define <4 x float> @add_ps_030(<4 x float> %x) {
-; SSE-LABEL: add_ps_030:
-; SSE: # %bb.0:
-; SSE-NEXT: haddps %xmm0, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: add_ps_030:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX-NEXT: retq
+; SSE-SLOW-LABEL: add_ps_030:
+; SSE-SLOW: # %bb.0:
+; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; SSE-SLOW-NEXT: retq
+;
+; SSE-FAST-LABEL: add_ps_030:
+; SSE-FAST: # %bb.0:
+; SSE-FAST-NEXT: haddps %xmm0, %xmm0
+; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; SSE-FAST-NEXT: retq
+;
+; AVX1-SLOW-LABEL: add_ps_030:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: add_ps_030:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-SLOW-LABEL: add_ps_030:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: add_ps_030:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX2-FAST-NEXT: retq
%l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
%r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
%add = fadd <4 x float> %l, %r
@@ -535,15 +670,41 @@ define <4 x float> @add_ps_030(<4 x floa
}
define <4 x float> @add_ps_007_2(<4 x float> %x) {
-; SSE-LABEL: add_ps_007_2:
-; SSE: # %bb.0:
-; SSE-NEXT: haddps %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: add_ps_007_2:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE-SLOW-LABEL: add_ps_007_2:
+; SSE-SLOW: # %bb.0:
+; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE-SLOW-NEXT: retq
+;
+; SSE-FAST-LABEL: add_ps_007_2:
+; SSE-FAST: # %bb.0:
+; SSE-FAST-NEXT: haddps %xmm0, %xmm0
+; SSE-FAST-NEXT: retq
+;
+; AVX1-SLOW-LABEL: add_ps_007_2:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: add_ps_007_2:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-SLOW-LABEL: add_ps_007_2:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: add_ps_007_2:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: retq
%l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
%r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
%add = fadd <4 x float> %l, %r
@@ -551,32 +712,83 @@ define <4 x float> @add_ps_007_2(<4 x fl
}
define <4 x float> @add_ps_008(<4 x float> %x) {
-; SSE-LABEL: add_ps_008:
-; SSE: # %bb.0:
-; SSE-NEXT: haddps %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: add_ps_008:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE-SLOW-LABEL: add_ps_008:
+; SSE-SLOW: # %bb.0:
+; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE-SLOW-NEXT: retq
+;
+; SSE-FAST-LABEL: add_ps_008:
+; SSE-FAST: # %bb.0:
+; SSE-FAST-NEXT: haddps %xmm0, %xmm0
+; SSE-FAST-NEXT: retq
+;
+; AVX1-SLOW-LABEL: add_ps_008:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: add_ps_008:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-SLOW-LABEL: add_ps_008:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: add_ps_008:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: retq
%l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
%add = fadd <4 x float> %l, %x
ret <4 x float> %add
}
define <4 x float> @add_ps_017(<4 x float> %x) {
-; SSE-LABEL: add_ps_017:
-; SSE: # %bb.0:
-; SSE-NEXT: haddps %xmm0, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: add_ps_017:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT: retq
+; SSE-SLOW-LABEL: add_ps_017:
+; SSE-SLOW: # %bb.0:
+; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE-SLOW-NEXT: addps %xmm0, %xmm1
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSE-SLOW-NEXT: retq
+;
+; SSE-FAST-LABEL: add_ps_017:
+; SSE-FAST: # %bb.0:
+; SSE-FAST-NEXT: haddps %xmm0, %xmm0
+; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE-FAST-NEXT: retq
+;
+; AVX1-SLOW-LABEL: add_ps_017:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: add_ps_017:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-SLOW-LABEL: add_ps_017:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: add_ps_017:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-FAST-NEXT: retq
%l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
%add = fadd <4 x float> %l, %x
%shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
@@ -584,17 +796,47 @@ define <4 x float> @add_ps_017(<4 x floa
}
define <4 x float> @add_ps_018(<4 x float> %x) {
-; SSE-LABEL: add_ps_018:
-; SSE: # %bb.0:
-; SSE-NEXT: haddps %xmm0, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: add_ps_018:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: retq
+; SSE-SLOW-LABEL: add_ps_018:
+; SSE-SLOW: # %bb.0:
+; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-SLOW-NEXT: retq
+;
+; SSE-FAST-LABEL: add_ps_018:
+; SSE-FAST: # %bb.0:
+; SSE-FAST-NEXT: haddps %xmm0, %xmm0
+; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-FAST-NEXT: retq
+;
+; AVX1-SLOW-LABEL: add_ps_018:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: add_ps_018:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-SLOW-LABEL: add_ps_018:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: add_ps_018:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-FAST-NEXT: retq
%l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
%r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
%add = fadd <4 x float> %l, %r
Modified: llvm/trunk/test/CodeGen/X86/haddsub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/haddsub.ll?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/haddsub.ll (original)
+++ llvm/trunk/test/CodeGen/X86/haddsub.ll Fri Oct 12 09:41:02 2018
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd1:
@@ -35,15 +37,29 @@ define <2 x double> @haddpd2(<2 x double
}
define <2 x double> @haddpd3(<2 x double> %x) {
-; SSE3-LABEL: haddpd3:
-; SSE3: # %bb.0:
-; SSE3-NEXT: haddpd %xmm0, %xmm0
-; SSE3-NEXT: retq
-;
-; AVX-LABEL: haddpd3:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE3-SLOW-LABEL: haddpd3:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
+; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-SLOW-NEXT: addpd %xmm0, %xmm1
+; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: haddpd3:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: haddpd3:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: haddpd3:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
%b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
%r = fadd <2 x double> %a, %b
@@ -83,15 +99,30 @@ define <4 x float> @haddps2(<4 x float>
}
define <4 x float> @haddps3(<4 x float> %x) {
-; SSE3-LABEL: haddps3:
-; SSE3: # %bb.0:
-; SSE3-NEXT: haddps %xmm0, %xmm0
-; SSE3-NEXT: retq
-;
-; AVX-LABEL: haddps3:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE3-SLOW-LABEL: haddps3:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: haddps3:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: haddps3:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: haddps3:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
%r = fadd <4 x float> %a, %b
@@ -99,15 +130,30 @@ define <4 x float> @haddps3(<4 x float>
}
define <4 x float> @haddps4(<4 x float> %x) {
-; SSE3-LABEL: haddps4:
-; SSE3: # %bb.0:
-; SSE3-NEXT: haddps %xmm0, %xmm0
-; SSE3-NEXT: retq
-;
-; AVX-LABEL: haddps4:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE3-SLOW-LABEL: haddps4:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: haddps4:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: haddps4:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: haddps4:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%r = fadd <4 x float> %a, %b
@@ -115,15 +161,30 @@ define <4 x float> @haddps4(<4 x float>
}
define <4 x float> @haddps5(<4 x float> %x) {
-; SSE3-LABEL: haddps5:
-; SSE3: # %bb.0:
-; SSE3-NEXT: haddps %xmm0, %xmm0
-; SSE3-NEXT: retq
-;
-; AVX-LABEL: haddps5:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE3-SLOW-LABEL: haddps5:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,3]
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: haddps5:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: haddps5:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,3,2,3]
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: haddps5:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
%r = fadd <4 x float> %a, %b
@@ -131,15 +192,27 @@ define <4 x float> @haddps5(<4 x float>
}
define <4 x float> @haddps6(<4 x float> %x) {
-; SSE3-LABEL: haddps6:
-; SSE3: # %bb.0:
-; SSE3-NEXT: haddps %xmm0, %xmm0
-; SSE3-NEXT: retq
-;
-; AVX-LABEL: haddps6:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE3-SLOW-LABEL: haddps6:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: haddps6:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: haddps6:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: haddps6:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%r = fadd <4 x float> %a, %b
@@ -147,15 +220,30 @@ define <4 x float> @haddps6(<4 x float>
}
define <4 x float> @haddps7(<4 x float> %x) {
-; SSE3-LABEL: haddps7:
-; SSE3: # %bb.0:
-; SSE3-NEXT: haddps %xmm0, %xmm0
-; SSE3-NEXT: retq
-;
-; AVX-LABEL: haddps7:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE3-SLOW-LABEL: haddps7:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: haddps7:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: haddps7:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: haddps7:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
%r = fadd <4 x float> %a, %b
@@ -179,15 +267,28 @@ define <2 x double> @hsubpd1(<2 x double
}
define <2 x double> @hsubpd2(<2 x double> %x) {
-; SSE3-LABEL: hsubpd2:
-; SSE3: # %bb.0:
-; SSE3-NEXT: hsubpd %xmm0, %xmm0
-; SSE3-NEXT: retq
-;
-; AVX-LABEL: hsubpd2:
-; AVX: # %bb.0:
-; AVX-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE3-SLOW-LABEL: hsubpd2:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
+; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-SLOW-NEXT: subpd %xmm1, %xmm0
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: hsubpd2:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: hsubpd2:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: hsubpd2:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
%b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
%r = fsub <2 x double> %a, %b
@@ -211,15 +312,31 @@ define <4 x float> @hsubps1(<4 x float>
}
define <4 x float> @hsubps2(<4 x float> %x) {
-; SSE3-LABEL: hsubps2:
-; SSE3: # %bb.0:
-; SSE3-NEXT: hsubps %xmm0, %xmm0
-; SSE3-NEXT: retq
-;
-; AVX-LABEL: hsubps2:
-; AVX: # %bb.0:
-; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE3-SLOW-LABEL: hsubps2:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE3-SLOW-NEXT: subps %xmm0, %xmm1
+; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: hsubps2:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: hsubps2:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: hsubps2:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
%r = fsub <4 x float> %a, %b
@@ -227,15 +344,31 @@ define <4 x float> @hsubps2(<4 x float>
}
define <4 x float> @hsubps3(<4 x float> %x) {
-; SSE3-LABEL: hsubps3:
-; SSE3: # %bb.0:
-; SSE3-NEXT: hsubps %xmm0, %xmm0
-; SSE3-NEXT: retq
-;
-; AVX-LABEL: hsubps3:
-; AVX: # %bb.0:
-; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE3-SLOW-LABEL: hsubps3:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE3-SLOW-NEXT: subps %xmm0, %xmm1
+; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: hsubps3:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: hsubps3:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: hsubps3:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%r = fsub <4 x float> %a, %b
@@ -243,15 +376,27 @@ define <4 x float> @hsubps3(<4 x float>
}
define <4 x float> @hsubps4(<4 x float> %x) {
-; SSE3-LABEL: hsubps4:
-; SSE3: # %bb.0:
-; SSE3-NEXT: hsubps %xmm0, %xmm0
-; SSE3-NEXT: retq
-;
-; AVX-LABEL: hsubps4:
-; AVX: # %bb.0:
-; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE3-SLOW-LABEL: hsubps4:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE3-SLOW-NEXT: subps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: hsubps4:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: hsubps4:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-SLOW-NEXT: vsubps %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: hsubps4:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%r = fsub <4 x float> %a, %b
@@ -293,16 +438,35 @@ define <8 x float> @vhaddps2(<8 x float>
}
define <8 x float> @vhaddps3(<8 x float> %x) {
-; SSE3-LABEL: vhaddps3:
-; SSE3: # %bb.0:
-; SSE3-NEXT: haddps %xmm0, %xmm0
-; SSE3-NEXT: haddps %xmm1, %xmm1
-; SSE3-NEXT: retq
-;
-; AVX-LABEL: vhaddps3:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
-; AVX-NEXT: retq
+; SSE3-SLOW-LABEL: vhaddps3:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
+; SSE3-SLOW-NEXT: movaps %xmm0, %xmm3
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE3-SLOW-NEXT: addps %xmm2, %xmm1
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE3-SLOW-NEXT: addps %xmm3, %xmm0
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: vhaddps3:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: vhaddps3:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX-SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: vhaddps3:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
%b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
%r = fadd <8 x float> %a, %b
@@ -327,16 +491,37 @@ define <8 x float> @vhsubps1(<8 x float>
}
define <8 x float> @vhsubps3(<8 x float> %x) {
-; SSE3-LABEL: vhsubps3:
-; SSE3: # %bb.0:
-; SSE3-NEXT: hsubps %xmm0, %xmm0
-; SSE3-NEXT: hsubps %xmm1, %xmm1
-; SSE3-NEXT: retq
-;
-; AVX-LABEL: vhsubps3:
-; AVX: # %bb.0:
-; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
-; AVX-NEXT: retq
+; SSE3-SLOW-LABEL: vhsubps3:
+; SSE3-SLOW: # %bb.0:
+; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
+; SSE3-SLOW-NEXT: movaps %xmm0, %xmm3
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE3-SLOW-NEXT: subps %xmm1, %xmm2
+; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE3-SLOW-NEXT: subps %xmm0, %xmm3
+; SSE3-SLOW-NEXT: movaps %xmm3, %xmm0
+; SSE3-SLOW-NEXT: movaps %xmm2, %xmm1
+; SSE3-SLOW-NEXT: retq
+;
+; SSE3-FAST-LABEL: vhsubps3:
+; SSE3-FAST: # %bb.0:
+; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
+; SSE3-FAST-NEXT: hsubps %xmm1, %xmm1
+; SSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: vhsubps3:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX-SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: vhsubps3:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
%b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
%r = fsub <8 x float> %a, %b
Modified: llvm/trunk/test/CodeGen/X86/madd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/madd.ll?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/madd.ll (original)
+++ llvm/trunk/test/CodeGen/X86/madd.ll Fri Oct 12 09:41:02 2018
@@ -50,7 +50,8 @@ define i32 @_Z10test_shortPsS_i_128(i16*
; AVX-NEXT: # %bb.2: # %middle.block
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
@@ -129,7 +130,8 @@ define i32 @_Z10test_shortPsS_i_256(i16*
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -153,7 +155,8 @@ define i32 @_Z10test_shortPsS_i_256(i16*
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vmovd %xmm0, %eax
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
@@ -252,7 +255,8 @@ define i32 @_Z10test_shortPsS_i_512(i16*
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -278,7 +282,8 @@ define i32 @_Z10test_shortPsS_i_512(i16*
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -437,7 +442,8 @@ define i32 @_Z10test_shortPsS_i_1024(i16
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -469,7 +475,8 @@ define i32 @_Z10test_shortPsS_i_1024(i16
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -620,7 +627,8 @@ define i32 @_Z9test_charPcS_i_128(i8* no
; AVX-NEXT: # %bb.2: # %middle.block
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
@@ -704,7 +712,8 @@ define i32 @_Z9test_charPcS_i_256(i8* no
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -729,7 +738,8 @@ define i32 @_Z9test_charPcS_i_256(i8* no
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vmovd %xmm0, %eax
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
@@ -836,7 +846,8 @@ define i32 @_Z9test_charPcS_i_512(i8* no
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -863,7 +874,8 @@ define i32 @_Z9test_charPcS_i_512(i8* no
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1039,7 +1051,8 @@ define i32 @_Z9test_charPcS_i_1024(i8* n
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -1073,7 +1086,8 @@ define i32 @_Z9test_charPcS_i_1024(i8* n
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1222,7 +1236,8 @@ define i32 @test_unsigned_short_128(i16*
; AVX-NEXT: # %bb.2: # %middle.block
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
@@ -1313,7 +1328,8 @@ define i32 @test_unsigned_short_256(i16*
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -1338,7 +1354,8 @@ define i32 @test_unsigned_short_256(i16*
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vmovd %xmm0, %eax
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
@@ -1460,7 +1477,8 @@ define i32 @test_unsigned_short_512(i16*
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -1491,7 +1509,8 @@ define i32 @test_unsigned_short_512(i16*
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1699,7 +1718,8 @@ define i32 @test_unsigned_short_1024(i16
; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -1742,7 +1762,8 @@ define i32 @test_unsigned_short_1024(i16
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -2692,7 +2713,8 @@ define i32 @madd_double_reduction(<8 x i
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: retq
;
@@ -2707,7 +2729,8 @@ define i32 @madd_double_reduction(<8 x i
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vmovd %xmm0, %eax
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/phaddsub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/phaddsub.ll?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/phaddsub.ll (original)
+++ llvm/trunk/test/CodeGen/X86/phaddsub.ll Fri Oct 12 09:41:02 2018
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw1:
@@ -67,15 +69,29 @@ define <4 x i32> @phaddd2(<4 x i32> %x,
}
define <4 x i32> @phaddd3(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd3:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddd3:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddd3:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddd3:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddd3:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddd3:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
%r = add <4 x i32> %a, %b
@@ -83,15 +99,29 @@ define <4 x i32> @phaddd3(<4 x i32> %x)
}
define <4 x i32> @phaddd4(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd4:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddd4:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddd4:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddd4:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddd4:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddd4:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%r = add <4 x i32> %a, %b
@@ -99,15 +129,29 @@ define <4 x i32> @phaddd4(<4 x i32> %x)
}
define <4 x i32> @phaddd5(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd5:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddd5:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddd5:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,3,2,3]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddd5:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddd5:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,3]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddd5:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
%r = add <4 x i32> %a, %b
@@ -115,15 +159,27 @@ define <4 x i32> @phaddd5(<4 x i32> %x)
}
define <4 x i32> @phaddd6(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd6:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddd6:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddd6:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddd6:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddd6:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddd6:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%r = add <4 x i32> %a, %b
@@ -131,15 +187,29 @@ define <4 x i32> @phaddd6(<4 x i32> %x)
}
define <4 x i32> @phaddd7(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd7:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddd7:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddd7:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddd7:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddd7:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddd7:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
%r = add <4 x i32> %a, %b
@@ -179,15 +249,30 @@ define <4 x i32> @phsubd1(<4 x i32> %x,
}
define <4 x i32> @phsubd2(<4 x i32> %x) {
-; SSSE3-LABEL: phsubd2:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phsubd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phsubd2:
-; AVX: # %bb.0:
-; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phsubd2:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT: psubd %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phsubd2:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phsubd2:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phsubd2:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
%r = sub <4 x i32> %a, %b
@@ -195,15 +280,30 @@ define <4 x i32> @phsubd2(<4 x i32> %x)
}
define <4 x i32> @phsubd3(<4 x i32> %x) {
-; SSSE3-LABEL: phsubd3:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phsubd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phsubd3:
-; AVX: # %bb.0:
-; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phsubd3:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-SLOW-NEXT: psubd %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phsubd3:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phsubd3:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phsubd3:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%r = sub <4 x i32> %a, %b
@@ -211,15 +311,27 @@ define <4 x i32> @phsubd3(<4 x i32> %x)
}
define <4 x i32> @phsubd4(<4 x i32> %x) {
-; SSSE3-LABEL: phsubd4:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phsubd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phsubd4:
-; AVX: # %bb.0:
-; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phsubd4:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSSE3-SLOW-NEXT: psubd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phsubd4:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phsubd4:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phsubd4:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%r = sub <4 x i32> %a, %b
@@ -284,15 +396,29 @@ define <4 x i32> @phsubd1_reverse(<4 x i
}
define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source1:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddd_single_source1:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddd_single_source1:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddd_single_source1:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source1:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddd_single_source1:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
%r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
%add = add <4 x i32> %l, %r
@@ -300,17 +426,33 @@ define <4 x i32> @phaddd_single_source1(
}
define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source2:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddd_single_source2:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddd_single_source2:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddd_single_source2:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source2:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddd_single_source2:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX-FAST-NEXT: retq
%l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
%r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
%add = add <4 x i32> %l, %r
@@ -319,15 +461,29 @@ define <4 x i32> @phaddd_single_source2(
}
define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source3:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddd_single_source3:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddd_single_source3:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddd_single_source3:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source3:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddd_single_source3:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
%r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
%add = add <4 x i32> %l, %r
@@ -335,32 +491,58 @@ define <4 x i32> @phaddd_single_source3(
}
define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source4:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddd_single_source4:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddd_single_source4:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddd_single_source4:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source4:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
+; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddd_single_source4:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
%add = add <4 x i32> %l, %x
ret <4 x i32> %add
}
define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source5:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddd_single_source5:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddd_single_source5:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
+; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddd_single_source5:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source5:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
+; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddd_single_source5:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-FAST-NEXT: retq
%l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
%add = add <4 x i32> %l, %x
%shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
@@ -368,17 +550,33 @@ define <4 x i32> @phaddd_single_source5(
}
define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source6:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddd_single_source6:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddd_single_source6:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddd_single_source6:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source6:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddd_single_source6:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-FAST-NEXT: retq
%l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
%r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
%add = add <4 x i32> %l, %r
@@ -387,15 +585,30 @@ define <4 x i32> @phaddd_single_source6(
}
define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source1:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddw_single_source1:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddw_single_source1:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13]
+; SSSE3-SLOW-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15]
+; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddw_single_source1:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source1:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13]
+; AVX-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15]
+; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddw_single_source1:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
%r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
%add = add <8 x i16> %l, %r
@@ -403,19 +616,41 @@ define <8 x i16> @phaddw_single_source1(
}
define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source2:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddw_single_source2:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddw_single_source2:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddw_single_source2:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source2:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddw_single_source2:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX-FAST-NEXT: retq
%l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
%r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
%add = add <8 x i16> %l, %r
@@ -424,15 +659,33 @@ define <8 x i16> @phaddw_single_source2(
}
define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source3:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddw_single_source3:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddw_single_source3:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddw_single_source3:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source3:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddw_single_source3:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
%r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
%add = add <8 x i16> %l, %r
@@ -440,32 +693,63 @@ define <8 x i16> @phaddw_single_source3(
}
define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source4:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddw_single_source4:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddw_single_source4:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: pslld $16, %xmm1
+; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddw_single_source4:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source4:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpslld $16, %xmm0, %xmm1
+; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddw_single_source4:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: retq
%l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6>
%add = add <8 x i16> %l, %x
ret <8 x i16> %add
}
define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source6:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; AVX-LABEL: phaddw_single_source6:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddw_single_source6:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSSE3-SLOW-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddw_single_source6:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source6:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddw_single_source6:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-FAST-NEXT: retq
%l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
%r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
%add = add <8 x i16> %l, %r
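
As context for the phaddsub.ll diffs above: the pattern being toggled is an even/odd shuffle pair feeding an add. A minimal standalone sketch follows (illustration only, not part of this patch; the file name and the exact -mattr spelling of the new attribute are assumptions here, so check the RUN lines in the updated test for the authoritative flags):

; hadd-sketch.ll -- compare, e.g.:
;   llc < hadd-sketch.ll -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+fast-hops
;   llc < hadd-sketch.ll -mtriple=x86_64-unknown-unknown -mattr=+ssse3,-fast-hops
define <4 x i32> @hadd_single_source(<4 x i32> %x) {
  ; Even and odd elements of one source...
  %even = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %odd = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  ; ...added pairwise: this is the shape that can select (v)phaddd, and without a
  ; preference for horizontal ops the backend may instead keep the shuffles plus
  ; (v)paddd, as the fast/slow checks above show.
  %add = add <4 x i32> %even, %odd
  ret <4 x i32> %add
}
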
Modified: llvm/trunk/test/CodeGen/X86/required-vector-width.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/required-vector-width.ll?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/required-vector-width.ll (original)
+++ llvm/trunk/test/CodeGen/X86/required-vector-width.ll Fri Oct 12 09:41:02 2018
@@ -190,7 +190,8 @@ define i32 @_Z9test_charPcS_i_256(i8* no
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -319,7 +320,8 @@ define i32 @sad_16i8_256() "required-vec
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/sad.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sad.ll?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sad.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sad.ll Fri Oct 12 09:41:02 2018
@@ -56,7 +56,8 @@ define i32 @sad_16i8() nounwind {
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -80,7 +81,8 @@ define i32 @sad_16i8() nounwind {
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -152,16 +154,16 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: pxor %xmm12, %xmm12
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: pxor %xmm13, %xmm13
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm15, %xmm15
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB1_1: # %vector.body
@@ -219,17 +221,17 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: paddd %xmm6, %xmm7
; SSE2-NEXT: pxor %xmm6, %xmm7
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE2-NEXT: paddd %xmm7, %xmm6
-; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm4, %xmm6
; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: paddd %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm4
; SSE2-NEXT: movdqa %xmm10, %xmm6
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE2-NEXT: paddd %xmm4, %xmm7
-; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm1
@@ -244,9 +246,9 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -256,9 +258,9 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm8, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm8
@@ -267,13 +269,13 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm15, %xmm0
; SSE2-NEXT: paddd %xmm14, %xmm13
; SSE2-NEXT: paddd %xmm0, %xmm13
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm13, %xmm6
; SSE2-NEXT: paddd %xmm0, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1]
@@ -317,7 +319,8 @@ define i32 @sad_32i8() nounwind {
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -343,7 +346,8 @@ define i32 @sad_32i8() nounwind {
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -420,42 +424,42 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movaps a+1040(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa a+1024(%rax), %xmm12
; SSE2-NEXT: movdqa a+1056(%rax), %xmm15
; SSE2-NEXT: movdqa a+1072(%rax), %xmm4
@@ -516,7 +520,7 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT: psubd %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE2-NEXT: psubd %xmm0, %xmm15
; SSE2-NEXT: movdqa %xmm7, %xmm0
@@ -524,8 +528,8 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; SSE2-NEXT: psubd %xmm3, %xmm9
-; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
@@ -534,7 +538,7 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE2-NEXT: psubd %xmm0, %xmm13
-; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm9, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
@@ -563,16 +567,16 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm6, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm6
; SSE2-NEXT: pxor %xmm1, %xmm6
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm5
@@ -584,118 +588,118 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm8, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm8
; SSE2-NEXT: pxor %xmm1, %xmm8
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm11, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm11
; SSE2-NEXT: pxor %xmm1, %xmm11
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm11, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm15, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm15
; SSE2-NEXT: pxor %xmm1, %xmm15
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm15, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm10, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm10
; SSE2-NEXT: pxor %xmm1, %xmm10
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm12, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm12
; SSE2-NEXT: pxor %xmm1, %xmm12
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm9, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm9
; SSE2-NEXT: pxor %xmm0, %xmm9
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm7
; SSE2-NEXT: pxor %xmm0, %xmm7
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm7, %xmm0
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm13, %xmm1
; SSE2-NEXT: movdqa %xmm13, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm1, %xmm4
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
@@ -737,30 +741,30 @@ define i32 @sad_avx64i8() nounwind {
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm4
@@ -803,27 +807,27 @@ define i32 @sad_avx64i8() nounwind {
; AVX1-NEXT: vpabsd %xmm4, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: vpaddd %xmm13, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm13
-; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: vpaddd %xmm8, %xmm1, %xmm1
-; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8
; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: vpaddd %xmm9, %xmm1, %xmm1
-; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9
; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: vpaddd %xmm10, %xmm1, %xmm1
-; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm10
; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
@@ -858,7 +862,8 @@ define i32 @sad_avx64i8() nounwind {
; AVX1-NEXT: vpaddd %xmm0, %xmm14, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: addq $24, %rsp
; AVX1-NEXT: vzeroupper
@@ -886,10 +891,10 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8
-; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -903,9 +908,9 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15
-; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7
; AVX2-NEXT: vpabsd %ymm9, %ymm8
; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
@@ -935,7 +940,8 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1430,7 +1436,8 @@ define i32 @sad_unroll_nonzero_initial(<
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: retq
;
@@ -1448,7 +1455,8 @@ define i32 @sad_unroll_nonzero_initial(<
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1533,7 +1541,8 @@ define i32 @sad_double_reduction(<16 x i
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: retq
;
@@ -1548,7 +1557,8 @@ define i32 @sad_double_reduction(<16 x i
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
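
The AVX tails updated in sad.ll above all change in the same place: the final lane-combining step trades one (v)phaddd for an explicit lane shuffle plus (v)paddd. The vector-reduce-add.ll changes that follow exercise that same tail directly through the reduction intrinsic; a minimal sketch of such a reduction (illustration only, not part of this patch):

declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)

define i32 @reduce_v4i32(<4 x i32> %v) {
  ; Lowers to log2(4) = 2 shuffle+add steps; the last of those steps is where
  ; the diffs below swap the horizontal add for a shuffle plus add.
  %r = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %v)
  ret i32 %r
}
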
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll Fri Oct 12 09:41:02 2018
@@ -195,28 +195,21 @@ define i64 @test_v16i64(<16 x i64> %a0)
;
define i32 @test_v4i32(<4 x i32> %a0) {
-; SSE2-LABEL: test_v4i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: phaddd %xmm1, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
;
@@ -224,7 +217,8 @@ define i32 @test_v4i32(<4 x i32> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: retq
%1 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a0)
@@ -232,24 +226,15 @@ define i32 @test_v4i32(<4 x i32> %a0) {
}
define i32 @test_v8i32(<8 x i32> %a0) {
-; SSE2-LABEL: test_v8i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: paddd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: phaddd %xmm1, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v8i32:
+; SSE: # %bb.0:
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i32:
; AVX1: # %bb.0:
@@ -257,7 +242,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -268,7 +254,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -279,7 +266,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -288,28 +276,17 @@ define i32 @test_v8i32(<8 x i32> %a0) {
}
define i32 @test_v16i32(<16 x i32> %a0) {
-; SSE2-LABEL: test_v16i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: paddd %xmm3, %xmm1
-; SSE41-NEXT: paddd %xmm2, %xmm1
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: paddd %xmm1, %xmm0
-; SSE41-NEXT: phaddd %xmm0, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v16i32:
+; SSE: # %bb.0:
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: paddd %xmm2, %xmm1
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i32:
; AVX1: # %bb.0:
@@ -320,7 +297,8 @@ define i32 @test_v16i32(<16 x i32> %a0)
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -332,7 +310,8 @@ define i32 @test_v16i32(<16 x i32> %a0)
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -355,36 +334,21 @@ define i32 @test_v16i32(<16 x i32> %a0)
}
define i32 @test_v32i32(<32 x i32> %a0) {
-; SSE2-LABEL: test_v32i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: paddd %xmm6, %xmm2
-; SSE2-NEXT: paddd %xmm7, %xmm3
-; SSE2-NEXT: paddd %xmm5, %xmm3
-; SSE2-NEXT: paddd %xmm1, %xmm3
-; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm2
-; SSE2-NEXT: paddd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: paddd %xmm6, %xmm2
-; SSE41-NEXT: paddd %xmm7, %xmm3
-; SSE41-NEXT: paddd %xmm5, %xmm3
-; SSE41-NEXT: paddd %xmm1, %xmm3
-; SSE41-NEXT: paddd %xmm4, %xmm2
-; SSE41-NEXT: paddd %xmm3, %xmm2
-; SSE41-NEXT: paddd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: paddd %xmm2, %xmm0
-; SSE41-NEXT: phaddd %xmm0, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v32i32:
+; SSE: # %bb.0:
+; SSE-NEXT: paddd %xmm6, %xmm2
+; SSE-NEXT: paddd %xmm7, %xmm3
+; SSE-NEXT: paddd %xmm5, %xmm3
+; SSE-NEXT: paddd %xmm1, %xmm3
+; SSE-NEXT: paddd %xmm4, %xmm2
+; SSE-NEXT: paddd %xmm3, %xmm2
+; SSE-NEXT: paddd %xmm0, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: retq
;
; AVX1-LABEL: test_v32i32:
; AVX1: # %bb.0:
@@ -401,7 +365,8 @@ define i32 @test_v32i32(<32 x i32> %a0)
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -415,7 +380,8 @@ define i32 @test_v32i32(<32 x i32> %a0)
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -443,29 +409,18 @@ define i32 @test_v32i32(<32 x i32> %a0)
;
define i16 @test_v8i16(<8 x i16> %a0) {
-; SSE2-LABEL: test_v8i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: paddw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: paddw %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: phaddw %xmm0, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v8i16:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v8i16:
; AVX: # %bb.0:
@@ -473,7 +428,8 @@ define i16 @test_v8i16(<8 x i16> %a0) {
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
@@ -484,7 +440,8 @@ define i16 @test_v8i16(<8 x i16> %a0) {
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: retq
@@ -493,31 +450,19 @@ define i16 @test_v8i16(<8 x i16> %a0) {
}
define i16 @test_v16i16(<16 x i16> %a0) {
-; SSE2-LABEL: test_v16i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: paddw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: paddw %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: phaddw %xmm0, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v16i16:
+; SSE: # %bb.0:
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i16:
; AVX1: # %bb.0:
@@ -527,7 +472,8 @@ define i16 @test_v16i16(<16 x i16> %a0)
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
@@ -541,7 +487,8 @@ define i16 @test_v16i16(<16 x i16> %a0)
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
@@ -555,7 +502,8 @@ define i16 @test_v16i16(<16 x i16> %a0)
; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vphaddw %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-NEXT: vzeroupper
@@ -565,35 +513,21 @@ define i16 @test_v16i16(<16 x i16> %a0)
}
define i16 @test_v32i16(<32 x i16> %a0) {
-; SSE2-LABEL: test_v32i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: paddw %xmm3, %xmm1
-; SSE2-NEXT: paddw %xmm2, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: paddw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: paddw %xmm3, %xmm1
-; SSE41-NEXT: paddw %xmm2, %xmm1
-; SSE41-NEXT: paddw %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: paddw %xmm0, %xmm1
-; SSE41-NEXT: phaddw %xmm1, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v32i16:
+; SSE: # %bb.0:
+; SSE-NEXT: paddw %xmm3, %xmm1
+; SSE-NEXT: paddw %xmm2, %xmm1
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE-NEXT: retq
;
; AVX1-LABEL: test_v32i16:
; AVX1: # %bb.0:
@@ -606,7 +540,8 @@ define i16 @test_v32i16(<32 x i16> %a0)
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
@@ -621,7 +556,8 @@ define i16 @test_v32i16(<32 x i16> %a0)
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
@@ -648,43 +584,25 @@ define i16 @test_v32i16(<32 x i16> %a0)
}
define i16 @test_v64i16(<64 x i16> %a0) {
-; SSE2-LABEL: test_v64i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: paddw %xmm6, %xmm2
-; SSE2-NEXT: paddw %xmm7, %xmm3
-; SSE2-NEXT: paddw %xmm5, %xmm3
-; SSE2-NEXT: paddw %xmm1, %xmm3
-; SSE2-NEXT: paddw %xmm4, %xmm2
-; SSE2-NEXT: paddw %xmm3, %xmm2
-; SSE2-NEXT: paddw %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: paddw %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: paddw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v64i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: paddw %xmm6, %xmm2
-; SSE41-NEXT: paddw %xmm7, %xmm3
-; SSE41-NEXT: paddw %xmm5, %xmm3
-; SSE41-NEXT: paddw %xmm1, %xmm3
-; SSE41-NEXT: paddw %xmm4, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: paddw %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: paddw %xmm0, %xmm1
-; SSE41-NEXT: phaddw %xmm1, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v64i16:
+; SSE: # %bb.0:
+; SSE-NEXT: paddw %xmm6, %xmm2
+; SSE-NEXT: paddw %xmm7, %xmm3
+; SSE-NEXT: paddw %xmm5, %xmm3
+; SSE-NEXT: paddw %xmm1, %xmm3
+; SSE-NEXT: paddw %xmm4, %xmm2
+; SSE-NEXT: paddw %xmm3, %xmm2
+; SSE-NEXT: paddw %xmm0, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT: paddw %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE-NEXT: retq
;
; AVX1-LABEL: test_v64i16:
; AVX1: # %bb.0:
@@ -703,7 +621,8 @@ define i16 @test_v64i16(<64 x i16> %a0)
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-NEXT: vzeroupper
@@ -720,7 +639,8 @@ define i16 @test_v64i16(<64 x i16> %a0)
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-NEXT: vzeroupper
Modified: llvm/trunk/test/CodeGen/X86/vector-reduce-fadd-fast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-fadd-fast.ll?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-fadd-fast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-fadd-fast.ll Fri Oct 12 09:41:02 2018
@@ -20,18 +20,20 @@ define float @test_v2f32(float %a0, <2 x
;
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: haddps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32:
; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm1, %xmm1, %xmm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vhaddps %xmm1, %xmm1, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1)
ret float %1
@@ -50,24 +52,27 @@ define float @test_v4f32(float %a0, <4 x
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE41-NEXT: addps %xmm1, %xmm0
-; SSE41-NEXT: haddps %xmm0, %xmm0
+; SSE41-NEXT: movaps %xmm1, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: addps %xmm1, %xmm2
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: addps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1)
ret float %1
@@ -88,10 +93,11 @@ define float @test_v8f32(float %a0, <8 x
; SSE41-LABEL: test_v8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm2, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE41-NEXT: addps %xmm1, %xmm0
-; SSE41-NEXT: haddps %xmm0, %xmm0
+; SSE41-NEXT: movaps %xmm1, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: addps %xmm1, %xmm2
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: addps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32:
@@ -100,7 +106,8 @@ define float @test_v8f32(float %a0, <8 x
; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -111,7 +118,8 @@ define float @test_v8f32(float %a0, <8 x
; AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -138,10 +146,11 @@ define float @test_v16f32(float %a0, <16
; SSE41-NEXT: addps %xmm4, %xmm2
; SSE41-NEXT: addps %xmm3, %xmm1
; SSE41-NEXT: addps %xmm2, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE41-NEXT: addps %xmm1, %xmm0
-; SSE41-NEXT: haddps %xmm0, %xmm0
+; SSE41-NEXT: movaps %xmm1, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: addps %xmm1, %xmm2
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: addps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32:
@@ -151,7 +160,8 @@ define float @test_v16f32(float %a0, <16
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -188,17 +198,20 @@ define float @test_v2f32_zero(<2 x float
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41: # %bb.0:
-; SSE41-NEXT: haddps %xmm0, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512: # %bb.0:
-; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
ret float %1
@@ -220,7 +233,8 @@ define float @test_v4f32_zero(<4 x float
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
-; SSE41-NEXT: haddps %xmm1, %xmm1
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -228,14 +242,16 @@ define float @test_v4f32_zero(<4 x float
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
ret float %1
@@ -259,7 +275,8 @@ define float @test_v8f32_zero(<8 x float
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
-; SSE41-NEXT: haddps %xmm1, %xmm1
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -269,7 +286,8 @@ define float @test_v8f32_zero(<8 x float
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -280,7 +298,8 @@ define float @test_v8f32_zero(<8 x float
; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -310,7 +329,8 @@ define float @test_v16f32_zero(<16 x flo
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
-; SSE41-NEXT: haddps %xmm1, %xmm1
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -321,7 +341,8 @@ define float @test_v16f32_zero(<16 x flo
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -358,17 +379,20 @@ define float @test_v2f32_undef(<2 x floa
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41: # %bb.0:
-; SSE41-NEXT: haddps %xmm0, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f32_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512: # %bb.0:
-; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
ret float %1
@@ -390,7 +414,8 @@ define float @test_v4f32_undef(<4 x floa
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
-; SSE41-NEXT: haddps %xmm1, %xmm1
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -398,14 +423,16 @@ define float @test_v4f32_undef(<4 x floa
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
ret float %1
@@ -429,7 +456,8 @@ define float @test_v8f32_undef(<8 x floa
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
-; SSE41-NEXT: haddps %xmm1, %xmm1
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -439,7 +467,8 @@ define float @test_v8f32_undef(<8 x floa
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -450,7 +479,8 @@ define float @test_v8f32_undef(<8 x floa
; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -480,7 +510,8 @@ define float @test_v16f32_undef(<16 x fl
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
-; SSE41-NEXT: haddps %xmm1, %xmm1
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -491,7 +522,8 @@ define float @test_v16f32_undef(<16 x fl
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -518,53 +550,43 @@ define float @test_v16f32_undef(<16 x fl
;
define double @test_v2f64(double %a0, <2 x double> %a1) {
-; SSE2-LABEL: test_v2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: haddpd %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64:
; AVX: # %bb.0:
-; AVX-NEXT: vhaddpd %xmm1, %xmm1, %xmm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vhaddpd %xmm1, %xmm1, %xmm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
ret double %1
}
define double @test_v4f64(double %a0, <4 x double> %a1) {
-; SSE2-LABEL: test_v4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm2, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: addpd %xmm2, %xmm0
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm2, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -573,7 +595,8 @@ define double @test_v4f64(double %a0, <4
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX512-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -582,31 +605,23 @@ define double @test_v4f64(double %a0, <4
}
define double @test_v8f64(double %a0, <8 x double> %a1) {
-; SSE2-LABEL: test_v8f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm4, %xmm2
-; SSE2-NEXT: addpd %xmm3, %xmm1
-; SSE2-NEXT: addpd %xmm2, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: addpd %xmm4, %xmm2
-; SSE41-NEXT: addpd %xmm3, %xmm0
-; SSE41-NEXT: addpd %xmm2, %xmm0
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v8f64:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm4, %xmm2
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: addpd %xmm2, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -627,32 +642,19 @@ define double @test_v8f64(double %a0, <8
}
define double @test_v16f64(double %a0, <16 x double> %a1) {
-; SSE2-LABEL: test_v16f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm6, %xmm2
-; SSE2-NEXT: addpd %xmm7, %xmm3
-; SSE2-NEXT: addpd %xmm5, %xmm1
-; SSE2-NEXT: addpd %xmm3, %xmm1
-; SSE2-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: addpd %xmm2, %xmm4
-; SSE2-NEXT: addpd %xmm1, %xmm4
-; SSE2-NEXT: movapd %xmm4, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: addpd %xmm4, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: addpd %xmm6, %xmm2
-; SSE41-NEXT: addpd %xmm7, %xmm3
-; SSE41-NEXT: addpd %xmm5, %xmm1
-; SSE41-NEXT: addpd %xmm3, %xmm1
-; SSE41-NEXT: addpd {{[0-9]+}}(%rsp), %xmm0
-; SSE41-NEXT: addpd %xmm2, %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v16f64:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm6, %xmm2
+; SSE-NEXT: addpd %xmm7, %xmm3
+; SSE-NEXT: addpd %xmm5, %xmm1
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: addpd %xmm2, %xmm4
+; SSE-NEXT: addpd %xmm1, %xmm4
+; SSE-NEXT: movapd %xmm4, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE-NEXT: addpd %xmm4, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64:
; AVX: # %bb.0:
@@ -661,7 +663,8 @@ define double @test_v16f64(double %a0, <
; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -687,53 +690,45 @@ define double @test_v16f64(double %a0, <
;
define double @test_v2f64_zero(<2 x double> %a0) {
-; SSE2-LABEL: test_v2f64_zero:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2f64_zero:
-; SSE41: # %bb.0:
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v2f64_zero:
+; SSE: # %bb.0:
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512: # %bb.0:
-; AVX512-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
ret double %1
}
define double @test_v4f64_zero(<4 x double> %a0) {
-; SSE2-LABEL: test_v4f64_zero:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4f64_zero:
-; SSE41: # %bb.0:
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v4f64_zero:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -742,7 +737,8 @@ define double @test_v4f64_zero(<4 x doub
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -751,31 +747,24 @@ define double @test_v4f64_zero(<4 x doub
}
define double @test_v8f64_zero(<8 x double> %a0) {
-; SSE2-LABEL: test_v8f64_zero:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm3, %xmm1
-; SSE2-NEXT: addpd %xmm2, %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8f64_zero:
-; SSE41: # %bb.0:
-; SSE41-NEXT: addpd %xmm3, %xmm1
-; SSE41-NEXT: addpd %xmm2, %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v8f64_zero:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: addpd %xmm2, %xmm0
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -796,32 +785,19 @@ define double @test_v8f64_zero(<8 x doub
}
define double @test_v16f64_zero(<16 x double> %a0) {
-; SSE2-LABEL: test_v16f64_zero:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm6, %xmm2
-; SSE2-NEXT: addpd %xmm4, %xmm0
-; SSE2-NEXT: addpd %xmm2, %xmm0
-; SSE2-NEXT: addpd %xmm7, %xmm3
-; SSE2-NEXT: addpd %xmm5, %xmm1
-; SSE2-NEXT: addpd %xmm3, %xmm1
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16f64_zero:
-; SSE41: # %bb.0:
-; SSE41-NEXT: addpd %xmm6, %xmm2
-; SSE41-NEXT: addpd %xmm4, %xmm0
-; SSE41-NEXT: addpd %xmm2, %xmm0
-; SSE41-NEXT: addpd %xmm7, %xmm3
-; SSE41-NEXT: addpd %xmm5, %xmm1
-; SSE41-NEXT: addpd %xmm3, %xmm1
-; SSE41-NEXT: addpd %xmm0, %xmm1
-; SSE41-NEXT: haddpd %xmm1, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v16f64_zero:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm6, %xmm2
+; SSE-NEXT: addpd %xmm4, %xmm0
+; SSE-NEXT: addpd %xmm2, %xmm0
+; SSE-NEXT: addpd %xmm7, %xmm3
+; SSE-NEXT: addpd %xmm5, %xmm1
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_zero:
; AVX: # %bb.0:
@@ -830,7 +806,8 @@ define double @test_v16f64_zero(<16 x do
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -856,53 +833,45 @@ define double @test_v16f64_zero(<16 x do
;
define double @test_v2f64_undef(<2 x double> %a0) {
-; SSE2-LABEL: test_v2f64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2f64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v2f64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
-; AVX512-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
ret double %1
}
define double @test_v4f64_undef(<4 x double> %a0) {
-; SSE2-LABEL: test_v4f64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4f64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v4f64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -911,7 +880,8 @@ define double @test_v4f64_undef(<4 x dou
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -920,31 +890,24 @@ define double @test_v4f64_undef(<4 x dou
}
define double @test_v8f64_undef(<8 x double> %a0) {
-; SSE2-LABEL: test_v8f64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm3, %xmm1
-; SSE2-NEXT: addpd %xmm2, %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8f64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: addpd %xmm3, %xmm1
-; SSE41-NEXT: addpd %xmm2, %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v8f64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: addpd %xmm2, %xmm0
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -965,32 +928,19 @@ define double @test_v8f64_undef(<8 x dou
}
define double @test_v16f64_undef(<16 x double> %a0) {
-; SSE2-LABEL: test_v16f64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm6, %xmm2
-; SSE2-NEXT: addpd %xmm4, %xmm0
-; SSE2-NEXT: addpd %xmm2, %xmm0
-; SSE2-NEXT: addpd %xmm7, %xmm3
-; SSE2-NEXT: addpd %xmm5, %xmm1
-; SSE2-NEXT: addpd %xmm3, %xmm1
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16f64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: addpd %xmm6, %xmm2
-; SSE41-NEXT: addpd %xmm4, %xmm0
-; SSE41-NEXT: addpd %xmm2, %xmm0
-; SSE41-NEXT: addpd %xmm7, %xmm3
-; SSE41-NEXT: addpd %xmm5, %xmm1
-; SSE41-NEXT: addpd %xmm3, %xmm1
-; SSE41-NEXT: addpd %xmm0, %xmm1
-; SSE41-NEXT: haddpd %xmm1, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v16f64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm6, %xmm2
+; SSE-NEXT: addpd %xmm4, %xmm0
+; SSE-NEXT: addpd %xmm2, %xmm0
+; SSE-NEXT: addpd %xmm7, %xmm3
+; SSE-NEXT: addpd %xmm5, %xmm1
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX: # %bb.0:
@@ -999,7 +949,8 @@ define double @test_v16f64_undef(<16 x d
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll?rev=344361&r1=344360&r2=344361&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll Fri Oct 12 09:41:02 2018
@@ -2700,36 +2700,21 @@ define <4 x i32> @combine_constant_inser
}
define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: PR22377:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: PR22377:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movaps %xmm0, %xmm1
-; SSSE3-NEXT: haddps %xmm0, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: PR22377:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: haddps %xmm0, %xmm1
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: PR22377:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; SSE-NEXT: addps %xmm0, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: PR22377:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: retq
entry:
%s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
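
As a side note, a minimal sketch of the kind of input the float-reduction tests above lower is included below. The intrinsic declaration and the fast call match the IR already visible in the test diffs; the file name, llc command lines, and expected instruction sequences are illustrative assumptions, not the actual RUN lines from the tests.

; reduce.ll (hypothetical file name)
declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>)

define float @reduce_v2f32(<2 x float> %v) {
  ; 'fast' is what enables the reassociating reduction lowering exercised above.
  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %v)
  ret float %r
}

; Assumed invocations for comparing the two codegen modes:
;   llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 reduce.ll
;     should now produce the shuffle form (movshdup + addps), matching the new CHECK lines above.
;   llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,+fast-hops reduce.ll
;     should keep the horizontal form (haddps) that the old SSE41 prefix checked for.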