[llvm] r304718 - [X86][AVX1] Split 256-bit vector non-temporal loads to keep them non-temporal (PR32744)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 5 09:02:01 PDT 2017
Author: rksimon
Date: Mon Jun 5 11:02:01 2017
New Revision: 304718
URL: http://llvm.org/viewvc/llvm-project?rev=304718&view=rev
Log:
[X86][AVX1] Split 256-bit vector non-temporal loads to keep them non-temporal (PR32744)
Differential Revision: https://reviews.llvm.org/D33728
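
For context, a minimal IR sketch of the case this change targets (illustrative only; the function and value names are hypothetical, but the pattern mirrors test_v8i32 in nontemporal-loads.ll below): a 32-byte-aligned load carrying !nontemporal metadata, which on AVX1-only targets previously lowered to a plain temporal vmovaps and is now split into two 128-bit VMOVNTDQA loads joined with vinsertf128.

define <8 x i32> @nt_load_v8i32(<8 x i32>* %src) {
entry:
  ; 32-byte aligned non-temporal load; without AVX2 there is no 256-bit
  ; VMOVNTDQA, so the load is split into two 128-bit non-temporal loads
  ; instead of falling back to a regular (cache-polluting) vmovaps.
  %v = load <8 x i32>, <8 x i32>* %src, align 32, !nontemporal !0
  ret <8 x i32> %v
}
!0 = !{i32 1}

Compiled for an AVX1-only target, the expected output matches the updated AVX1 checks in the tests below: vmovntdqa (%rdi), %xmm0; vmovntdqa 16(%rdi), %xmm1; vinsertf128 $1, %xmm1, %ymm0, %ymm0.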
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/fast-isel-nontemporal.ll
llvm/trunk/test/CodeGen/X86/nontemporal-loads.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=304718&r1=304717&r2=304718&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Jun 5 11:02:01 2017
@@ -6391,6 +6391,7 @@ static SDValue LowerAsSplatVectorLoad(SD
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
bool isAfterLegalize) {
unsigned NumElems = Elts.size();
@@ -6495,6 +6496,12 @@ static SDValue EltsFromConsecutiveLoads(
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
+ // Don't create 256-bit non-temporal aligned loads without AVX2 as these
+ // will lower to regular temporal loads and use the cache.
+ if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
+ VT.is256BitVector() && !Subtarget.hasInt256())
+ return SDValue();
+
if (IsConsecutiveLoad)
return CreateLoad(VT, LDBase);
@@ -7701,7 +7708,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDV
// See if we can use a vector load to get all of the elements.
if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
- if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
+ if (SDValue LD =
+ EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
return LD;
}
@@ -28784,7 +28792,8 @@ static SDValue combineShuffle(SDNode *N,
}
if (Elts.size() == VT.getVectorNumElements())
- if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
+ if (SDValue LD =
+ EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
return LD;
// For AVX2, we sometimes want to combine
@@ -32377,15 +32386,17 @@ static SDValue combineLoad(SDNode *N, Se
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// For chips with slow 32-byte unaligned loads, break the 32-byte operation
- // into two 16-byte operations.
+ // into two 16-byte operations. Also split non-temporal aligned loads on
+ // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
unsigned AddressSpace = Ld->getAddressSpace();
unsigned Alignment = Ld->getAlignment();
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
- TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
- AddressSpace, Alignment, &Fast) && !Fast) {
+ ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
+ (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
+ AddressSpace, Alignment, &Fast) && !Fast))) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
@@ -35093,7 +35104,8 @@ static SDValue combineInsertSubvector(SD
if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
OpVT, AS, Alignment, &Fast) && Fast) {
SDValue Ops[] = {SubVec2, SubVec};
- if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
+ if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
+ Subtarget, false))
return Ld;
}
}
Modified: llvm/trunk/test/CodeGen/X86/fast-isel-nontemporal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fast-isel-nontemporal.ll?rev=304718&r1=304717&r2=304718&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fast-isel-nontemporal.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fast-isel-nontemporal.ll Mon Jun 5 11:02:01 2017
@@ -957,8 +957,16 @@ define <16 x float> @test_load_nt16xfloa
;
; AVX1-LABEL: test_load_nt16xfloat:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: vmovaps %xmm0, %xmm1
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
+; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: vmovaps %xmm2, %xmm1
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt16xfloat:
@@ -1003,8 +1011,16 @@ define <8 x double> @test_load_nt8xdoubl
;
; AVX1-LABEL: test_load_nt8xdouble:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovapd (%rdi), %ymm0
-; AVX1-NEXT: vmovapd 32(%rdi), %ymm1
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: vmovaps %xmm0, %xmm1
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
+; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: vmovaps %xmm2, %xmm1
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt8xdouble:
@@ -1049,8 +1065,16 @@ define <64 x i8> @test_load_nt64xi8(<64
;
; AVX1-LABEL: test_load_nt64xi8:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: vmovaps %xmm0, %xmm1
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
+; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: vmovaps %xmm2, %xmm1
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt64xi8:
@@ -1101,8 +1125,16 @@ define <32 x i16> @test_load_nt32xi16(<3
;
; AVX1-LABEL: test_load_nt32xi16:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: vmovaps %xmm0, %xmm1
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
+; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: vmovaps %xmm2, %xmm1
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt32xi16:
@@ -1153,8 +1185,16 @@ define <16 x i32> @test_load_nt16xi32(<1
;
; AVX1-LABEL: test_load_nt16xi32:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: vmovaps %xmm0, %xmm1
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
+; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: vmovaps %xmm2, %xmm1
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt16xi32:
@@ -1199,8 +1239,16 @@ define <8 x i64> @test_load_nt8xi64(<8 x
;
; AVX1-LABEL: test_load_nt8xi64:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: vmovaps %xmm0, %xmm1
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
+; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: vmovaps %xmm2, %xmm1
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt8xi64:
Modified: llvm/trunk/test/CodeGen/X86/nontemporal-loads.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/nontemporal-loads.ll?rev=304718&r1=304717&r2=304718&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/nontemporal-loads.ll (original)
+++ llvm/trunk/test/CodeGen/X86/nontemporal-loads.ll Mon Jun 5 11:02:01 2017
@@ -168,7 +168,9 @@ define <8 x float> @test_v8f32(<8 x floa
;
; AVX1-LABEL: test_v8f32:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8f32:
@@ -199,7 +201,9 @@ define <8 x i32> @test_v8i32(<8 x i32>*
;
; AVX1-LABEL: test_v8i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i32:
@@ -240,7 +244,9 @@ define <4 x double> @test_v4f64(<4 x dou
;
; AVX1-LABEL: test_v4f64:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4f64:
@@ -271,7 +277,9 @@ define <4 x i64> @test_v4i64(<4 x i64>*
;
; AVX1-LABEL: test_v4i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4i64:
@@ -302,7 +310,9 @@ define <16 x i16> @test_v16i16(<16 x i16
;
; AVX1-LABEL: test_v16i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i16:
@@ -333,7 +343,9 @@ define <32 x i8> @test_v32i8(<32 x i8>*
;
; AVX1-LABEL: test_v32i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i8:
@@ -370,8 +382,12 @@ define <16 x float> @test_v16f32(<16 x f
;
; AVX1-LABEL: test_v16f32:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16f32:
@@ -407,8 +423,12 @@ define <16 x i32> @test_v16i32(<16 x i32
;
; AVX1-LABEL: test_v16i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i32:
@@ -444,8 +464,12 @@ define <8 x double> @test_v8f64(<8 x dou
;
; AVX1-LABEL: test_v8f64:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8f64:
@@ -481,8 +505,12 @@ define <8 x i64> @test_v8i64(<8 x i64>*
;
; AVX1-LABEL: test_v8i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i64:
@@ -518,8 +546,12 @@ define <32 x i16> @test_v32i16(<32 x i16
;
; AVX1-LABEL: test_v32i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i16:
@@ -567,8 +599,12 @@ define <64 x i8> @test_v64i8(<64 x i8>*
;
; AVX1-LABEL: test_v64i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm1
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i8:
@@ -743,7 +779,10 @@ define <8 x float> @test_arg_v8f32(<8 x
;
; AVX1-LABEL: test_arg_v8f32:
; AVX1: # BB#0:
-; AVX1-NEXT: vaddps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm1
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v8f32:
@@ -771,10 +810,10 @@ define <8 x i32> @test_arg_v8i32(<8 x i3
;
; AVX1-LABEL: test_arg_v8i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm1
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -815,7 +854,10 @@ define <4 x double> @test_arg_v4f64(<4 x
;
; AVX1-LABEL: test_arg_v4f64:
; AVX1: # BB#0:
-; AVX1-NEXT: vaddpd (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm1
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v4f64:
@@ -843,10 +885,10 @@ define <4 x i64> @test_arg_v4i64(<4 x i6
;
; AVX1-LABEL: test_arg_v4i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm1
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -876,10 +918,10 @@ define <16 x i16> @test_arg_v16i16(<16 x
;
; AVX1-LABEL: test_arg_v16i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm1
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -909,10 +951,10 @@ define <32 x i8> @test_arg_v32i8(<32 x i
;
; AVX1-LABEL: test_arg_v32i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm1
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -946,8 +988,14 @@ define <16 x float> @test_arg_v16f32(<16
;
; AVX1-LABEL: test_arg_v16f32:
; AVX1: # BB#0:
-; AVX1-NEXT: vaddps (%rdi), %ymm0, %ymm0
-; AVX1-NEXT: vaddps 32(%rdi), %ymm1, %ymm1
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm3
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vaddps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v16f32:
@@ -979,18 +1027,18 @@ define <16 x i32> @test_arg_v16i32(<16 x
;
; AVX1-LABEL: test_arg_v16i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm2
-; AVX1-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm4
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v16i32:
@@ -1022,8 +1070,14 @@ define <8 x double> @test_arg_v8f64(<8 x
;
; AVX1-LABEL: test_arg_v8f64:
; AVX1: # BB#0:
-; AVX1-NEXT: vaddpd (%rdi), %ymm0, %ymm0
-; AVX1-NEXT: vaddpd 32(%rdi), %ymm1, %ymm1
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm3
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v8f64:
@@ -1055,18 +1109,18 @@ define <8 x i64> @test_arg_v8i64(<8 x i6
;
; AVX1-LABEL: test_arg_v8i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm2
-; AVX1-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm4
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v8i64:
@@ -1098,18 +1152,18 @@ define <32 x i16> @test_arg_v32i16(<32 x
;
; AVX1-LABEL: test_arg_v32i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm2
-; AVX1-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpaddw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpaddw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm4
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpaddw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpaddw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v32i16:
@@ -1157,18 +1211,18 @@ define <64 x i8> @test_arg_v64i8(<64 x i
;
; AVX1-LABEL: test_arg_v64i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm2
-; AVX1-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpaddb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm4
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v64i8: