[llvm] r318983 - [X86] Don't report that gathers are legal on Skylake CPUs when AVX2/AVX512 is disabled. Allow gathers on SKX/CNL/ICL when AVX512 is disabled by using AVX2 instructions.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Nov 25 10:09:37 PST 2017
Author: ctopper
Date: Sat Nov 25 10:09:37 2017
New Revision: 318983
URL: http://llvm.org/viewvc/llvm-project?rev=318983&view=rev
Log:
[X86] Don't report that gathers are legal on Skylake CPUs when AVX2/AVX512 is disabled. Allow gathers on SKX/CNL/ICL when AVX512 is disabled by using AVX2 instructions.
Summary:
This adds a new "fast gather" feature bit covering all CPUs that support fast gathers, which we can use independently of whether the AVX512 feature is enabled. I'm only using this new bit to qualify AVX2 codegen. AVX512 still implicitly assumes fast gather, to keep existing tests working and to match the scatter behavior.
Test command lines have been added for these two cases.
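For context, the rule this patch installs can be condensed as the C++ sketch below. It is only an illustration of the check added to X86TTIImpl::isLegalMaskedGather and of the new X86Subtarget accessor; the standalone helper name is hypothetical and not part of the patch itself.

  #include "X86Subtarget.h"

  // Mirrors the new legality check: AVX-512 targets keep gather implicitly,
  // while other targets need both AVX2 (for the vpgather*/vgather* forms)
  // and the new fast-gather tuning bit introduced here.
  static bool isGatherWorthwhile(const X86Subtarget *ST, unsigned DataWidth) {
    bool FastEnough = ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
    return (DataWidth == 32 || DataWidth == 64) && FastEnough;
  }

The same predicate shape is used in X86Subtarget::initSubtargetFeatures to decide when GatherOverhead is lowered to 2 for cost modeling.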
Reviewers: magabari, delena, RKSimon, zvi
Reviewed By: RKSimon
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D40282
Modified:
llvm/trunk/lib/Target/X86/X86.td
llvm/trunk/lib/Target/X86/X86Subtarget.cpp
llvm/trunk/lib/Target/X86/X86Subtarget.h
llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll
Modified: llvm/trunk/lib/Target/X86/X86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86.td?rev=318983&r1=318982&r2=318983&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86.td (original)
+++ llvm/trunk/lib/Target/X86/X86.td Sat Nov 25 10:09:37 2017
@@ -308,6 +308,14 @@ def FeatureMacroFusion
: SubtargetFeature<"macrofusion", "HasMacroFusion", "true",
"Various instructions can be fused with conditional branches">;
+// Gather is available since Haswell (AVX2 set). So technically, we can
+// generate Gathers on all AVX2 processors. But the overhead on HSW is high.
+// Skylake Client processor has faster Gathers than HSW and performance is
+// similar to Skylake Server (AVX-512).
+def FeatureHasFastGather
+ : SubtargetFeature<"fast-gather", "HasFastGather", "true",
+ "Indicates if gather is reasonably fast.">;
+
//===----------------------------------------------------------------------===//
// X86 processors supported.
//===----------------------------------------------------------------------===//
@@ -613,7 +621,8 @@ def SKLFeatures : ProcessorFeatures<BDWF
class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel,
SKLFeatures.Value, [
- ProcIntelSKL
+ ProcIntelSKL,
+ FeatureHasFastGather
]>;
def : SkylakeClientProc<"skylake">;
@@ -637,7 +646,8 @@ class KnightsLandingProc<string Name> :
KNLFeatures.Value, [
ProcIntelKNL,
FeatureSlowTwoMemOps,
- FeatureFastPartialYMMorZMMWrite
+ FeatureFastPartialYMMorZMMWrite,
+ FeatureHasFastGather
]>;
def : KnightsLandingProc<"knl">;
@@ -646,6 +656,7 @@ class KnightsMillProc<string Name> : Pro
ProcIntelKNL,
FeatureSlowTwoMemOps,
FeatureFastPartialYMMorZMMWrite,
+ FeatureHasFastGather,
FeatureVPOPCNTDQ
]>;
def : KnightsMillProc<"knm">; // TODO Add AVX5124FMAPS/AVX5124VNNIW features
@@ -662,7 +673,8 @@ def SKXFeatures : ProcessorFeatures<SKLF
class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
SKXFeatures.Value, [
- ProcIntelSKX
+ ProcIntelSKX,
+ FeatureHasFastGather
]>;
def : SkylakeServerProc<"skylake-avx512">;
def : SkylakeServerProc<"skx">; // Legacy alias.
@@ -675,7 +687,8 @@ def CNLFeatures : ProcessorFeatures<SKXF
class CannonlakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
CNLFeatures.Value, [
- ProcIntelCNL
+ ProcIntelCNL,
+ FeatureHasFastGather
]>;
def : CannonlakeProc<"cannonlake">;
@@ -691,7 +704,8 @@ def ICLFeatures : ProcessorFeatures<CNLF
class IcelakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
ICLFeatures.Value, [
- ProcIntelICL
+ ProcIntelICL,
+ FeatureHasFastGather
]>;
def : IcelakeProc<"icelake">;
Modified: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.cpp?rev=318983&r1=318982&r2=318983&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp Sat Nov 25 10:09:37 2017
@@ -270,14 +270,13 @@ void X86Subtarget::initSubtargetFeatures
isTargetKFreeBSD() || In64BitMode)
stackAlignment = 16;
- // Gather is available since Haswell (AVX2 set). So technically, we can
- // generate Gathers on all AVX2 processors. But the overhead on HSW is high.
- // Skylake Client processor has faster Gathers than HSW and performance is
- // similar to Skylake Server (AVX-512). The specified overhead is relative to
- // the Load operation. "2" is the number provided by Intel architects. This
+ // Some CPUs have more overhead for gather. The specified overhead is relative
+ // to the Load operation. "2" is the number provided by Intel architects. This
// parameter is used for cost estimation of Gather Op and comparison with
// other alternatives.
- if (X86ProcFamily == IntelSkylake || hasAVX512())
+ // TODO: Remove the explicit hasAVX512()? That would mean we would only
+ // enable gather with a -march.
+ if (hasAVX512() || (hasAVX2() && hasFastGather()))
GatherOverhead = 2;
if (hasAVX512())
ScatterOverhead = 2;
@@ -345,6 +344,7 @@ void X86Subtarget::initializeEnvironment
HasCmpxchg16b = false;
UseLeaForSP = false;
HasFastPartialYMMorZMMWrite = false;
+ HasFastGather = false;
HasFastScalarFSQRT = false;
HasFastVectorFSQRT = false;
HasFastLZCNT = false;
Modified: llvm/trunk/lib/Target/X86/X86Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.h?rev=318983&r1=318982&r2=318983&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h Sat Nov 25 10:09:37 2017
@@ -229,6 +229,10 @@ protected:
/// of a YMM or ZMM register without clearing the upper part.
bool HasFastPartialYMMorZMMWrite;
+ /// True if gather is reasonably fast. This is true for Skylake client and
+ /// all AVX-512 CPUs.
+ bool HasFastGather;
+
/// True if hardware SQRTSS instruction is at least as fast (latency) as
/// RSQRTSS followed by a Newton-Raphson iteration.
bool HasFastScalarFSQRT;
@@ -514,6 +518,7 @@ public:
bool hasFastPartialYMMorZMMWrite() const {
return HasFastPartialYMMorZMMWrite;
}
+ bool hasFastGather() const { return HasFastGather; }
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasFastLZCNT() const { return HasFastLZCNT; }
Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=318983&r1=318982&r2=318983&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Sat Nov 25 10:09:37 2017
@@ -2518,9 +2518,11 @@ bool X86TTIImpl::isLegalMaskedGather(Typ
int DataWidth = isa<PointerType>(ScalarTy) ?
DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
- // AVX-512 and Skylake AVX2 allows gather and scatter
- return (DataWidth == 32 || DataWidth == 64) && (ST->hasAVX512() ||
- ST->getProcFamily() == X86Subtarget::IntelSkylake);
+ // Some CPUs have better gather performance than others.
+ // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
+ // enable gather with a -march.
+ return (DataWidth == 32 || DataWidth == 64) &&
+ (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
}
bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
Modified: llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll?rev=318983&r1=318982&r2=318983&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-masked-gather.ll Sat Nov 25 10:09:37 2017
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=skylake -mtriple=i386-unknown-linux-gnu -mattr=+avx2 | FileCheck --check-prefix=X86 %s
; RUN: llc < %s -mcpu=skylake -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck --check-prefix=X64 %s
+; RUN: llc < %s -mcpu=skx -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2,-avx512f | FileCheck --check-prefix=X64 %s
+; RUN: llc < %s -mcpu=skylake -mtriple=x86_64-unknown-linux-gnu -mattr=-avx2 | FileCheck --check-prefix=NOGATHER %s
declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i32> %passthro)
@@ -23,6 +25,29 @@ define <2 x i32> @masked_gather_v2i32(<2
; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
; X64-NEXT: vpmovsxdq %xmm1, %xmm0
; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v2i32:
+; NOGATHER: # BB#0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %XMM2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB0_2
+; NOGATHER-NEXT: # BB#1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB0_2: # %else
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB0_4
+; NOGATHER-NEXT: # BB#3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: movl (%rax), %eax
+; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm2, %xmm2
+; NOGATHER-NEXT: .LBB0_4: # %else2
+; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0
+; NOGATHER-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT: retq
entry:
%ld = load <2 x i32*>, <2 x i32*>* %ptr
%res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ld, i32 0, <2 x i1> %masks, <2 x i32> %passthro)
@@ -50,6 +75,30 @@ define <4 x i32> @masked_gather_v2i32_co
; X64-NEXT: vpmovsxdq %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v2i32_concat:
+; NOGATHER: # BB#0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %XMM2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB1_2
+; NOGATHER-NEXT: # BB#1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB1_2: # %else
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB1_4
+; NOGATHER-NEXT: # BB#3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: movl (%rax), %eax
+; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm2, %xmm2
+; NOGATHER-NEXT: .LBB1_4: # %else2
+; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0
+; NOGATHER-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; NOGATHER-NEXT: retq
entry:
%ld = load <2 x i32*>, <2 x i32*>* %ptr
%res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ld, i32 0, <2 x i1> %masks, <2 x i32> %passthro)
@@ -77,6 +126,29 @@ define <2 x float> @masked_gather_v2floa
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v2float:
+; NOGATHER: # BB#0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %XMM2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB2_2
+; NOGATHER-NEXT: # BB#1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB2_2: # %else
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB2_4
+; NOGATHER-NEXT: # BB#3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; NOGATHER-NEXT: .LBB2_4: # %else2
+; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT: retq
entry:
%ld = load <2 x float*>, <2 x float*>* %ptr
%res = call <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ld, i32 0, <2 x i1> %masks, <2 x float> %passthro)
@@ -101,6 +173,29 @@ define <4 x float> @masked_gather_v2floa
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v2float_concat:
+; NOGATHER: # BB#0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %XMM2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB3_2
+; NOGATHER-NEXT: # BB#1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB3_2: # %else
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB3_4
+; NOGATHER-NEXT: # BB#3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; NOGATHER-NEXT: .LBB3_4: # %else2
+; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT: retq
entry:
%ld = load <2 x float*>, <2 x float*>* %ptr
%res = call <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ld, i32 0, <2 x i1> %masks, <2 x float> %passthro)
@@ -124,6 +219,44 @@ define <4 x i32> @masked_gather_v4i32(<4
; X64-NEXT: vmovdqa %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v4i32:
+; NOGATHER: # BB#0: # %entry
+; NOGATHER-NEXT: vpextrb $0, %xmm1, %eax
+; NOGATHER-NEXT: # implicit-def: %XMM3
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB4_2
+; NOGATHER-NEXT: # BB#1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm0, %rax
+; NOGATHER-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB4_2: # %else
+; NOGATHER-NEXT: vpextrb $4, %xmm1, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB4_4
+; NOGATHER-NEXT: # BB#3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT: .LBB4_4: # %else2
+; NOGATHER-NEXT: vpextrb $8, %xmm1, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB4_6
+; NOGATHER-NEXT: # BB#5: # %cond.load4
+; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm4
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT: .LBB4_6: # %else5
+; NOGATHER-NEXT: vpextrb $12, %xmm1, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB4_8
+; NOGATHER-NEXT: # BB#7: # %cond.load7
+; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
+; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT: .LBB4_8: # %else8
+; NOGATHER-NEXT: vpslld $31, %xmm1, %xmm0
+; NOGATHER-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; NOGATHER-NEXT: vzeroupper
+; NOGATHER-NEXT: retq
entry:
%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 0, <4 x i1> %masks, <4 x i32> %passthro)
ret <4 x i32> %res
@@ -144,6 +277,44 @@ define <4 x float> @masked_gather_v4floa
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v4float:
+; NOGATHER: # BB#0: # %entry
+; NOGATHER-NEXT: vpextrb $0, %xmm1, %eax
+; NOGATHER-NEXT: # implicit-def: %XMM3
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB5_2
+; NOGATHER-NEXT: # BB#1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm0, %rax
+; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB5_2: # %else
+; NOGATHER-NEXT: vpextrb $4, %xmm1, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB5_4
+; NOGATHER-NEXT: # BB#3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; NOGATHER-NEXT: .LBB5_4: # %else2
+; NOGATHER-NEXT: vpextrb $8, %xmm1, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB5_6
+; NOGATHER-NEXT: # BB#5: # %cond.load4
+; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm4
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; NOGATHER-NEXT: .LBB5_6: # %else5
+; NOGATHER-NEXT: vpextrb $12, %xmm1, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB5_8
+; NOGATHER-NEXT: # BB#7: # %cond.load7
+; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
+; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
+; NOGATHER-NEXT: .LBB5_8: # %else8
+; NOGATHER-NEXT: vpslld $31, %xmm1, %xmm0
+; NOGATHER-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; NOGATHER-NEXT: vzeroupper
+; NOGATHER-NEXT: retq
entry:
%res = call <4 x float> @llvm.masked.gather.v4float(<4 x float*> %ptrs, i32 0, <4 x i1> %masks, <4 x float> %passthro)
ret <4 x float> %res
@@ -174,6 +345,90 @@ define <8 x i32> @masked_gather_v8i32(<8
; X64-NEXT: vpgatherqd %xmm0, (,%ymm2), %xmm1
; X64-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm0
; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v8i32:
+; NOGATHER: # BB#0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %ymm4
+; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %YMM2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_2
+; NOGATHER-NEXT: # BB#1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB6_2: # %else
+; NOGATHER-NEXT: vpextrb $2, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_4
+; NOGATHER-NEXT: # BB#3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm4, %rax
+; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm5
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT: .LBB6_4: # %else2
+; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_6
+; NOGATHER-NEXT: # BB#5: # %cond.load4
+; NOGATHER-NEXT: vextractf128 $1, %ymm4, %xmm5
+; NOGATHER-NEXT: vmovq %xmm5, %rax
+; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm5
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT: .LBB6_6: # %else5
+; NOGATHER-NEXT: vpextrb $6, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_8
+; NOGATHER-NEXT: # BB#7: # %cond.load7
+; NOGATHER-NEXT: vextractf128 $1, %ymm4, %xmm4
+; NOGATHER-NEXT: vpextrq $1, %xmm4, %rax
+; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm2, %xmm4
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT: .LBB6_8: # %else8
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_10
+; NOGATHER-NEXT: # BB#9: # %cond.load10
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm4
+; NOGATHER-NEXT: vpinsrd $0, (%rax), %xmm4, %xmm4
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB6_10: # %else11
+; NOGATHER-NEXT: vpextrb $10, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_12
+; NOGATHER-NEXT: # BB#11: # %cond.load13
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm4
+; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm4, %xmm4
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB6_12: # %else14
+; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_14
+; NOGATHER-NEXT: # BB#13: # %cond.load16
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm4
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm4
+; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm4, %xmm4
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB6_14: # %else17
+; NOGATHER-NEXT: vpextrb $14, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_16
+; NOGATHER-NEXT: # BB#15: # %cond.load19
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
+; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB6_16: # %else20
+; NOGATHER-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; NOGATHER-NEXT: vpslld $31, %xmm3, %xmm3
+; NOGATHER-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; NOGATHER-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; NOGATHER-NEXT: retq
entry:
%ld = load <8 x i32*>, <8 x i32*>* %ptr
%res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ld, i32 0, <8 x i1> %masks, <8 x i32> %passthro)
@@ -205,6 +460,91 @@ define <8 x float> @masked_gather_v8floa
; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1
; X64-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm0
; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v8float:
+; NOGATHER: # BB#0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %ymm4
+; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %YMM2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_2
+; NOGATHER-NEXT: # BB#1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB7_2: # %else
+; NOGATHER-NEXT: vpextrb $2, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_4
+; NOGATHER-NEXT: # BB#3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm4, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm5 = xmm2[0],mem[0],xmm2[2,3]
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT: .LBB7_4: # %else2
+; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_6
+; NOGATHER-NEXT: # BB#5: # %cond.load4
+; NOGATHER-NEXT: vextractf128 $1, %ymm4, %xmm5
+; NOGATHER-NEXT: vmovq %xmm5, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm5 = xmm2[0,1],mem[0],xmm2[3]
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT: .LBB7_6: # %else5
+; NOGATHER-NEXT: vpextrb $6, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_8
+; NOGATHER-NEXT: # BB#7: # %cond.load7
+; NOGATHER-NEXT: vextractf128 $1, %ymm4, %xmm4
+; NOGATHER-NEXT: vpextrq $1, %xmm4, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1,2],mem[0]
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT: .LBB7_8: # %else8
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_10
+; NOGATHER-NEXT: # BB#9: # %cond.load10
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm5
+; NOGATHER-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB7_10: # %else11
+; NOGATHER-NEXT: vpextrb $10, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_12
+; NOGATHER-NEXT: # BB#11: # %cond.load13
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm4
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB7_12: # %else14
+; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_14
+; NOGATHER-NEXT: # BB#13: # %cond.load16
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm4
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm4
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB7_14: # %else17
+; NOGATHER-NEXT: vpextrb $14, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_16
+; NOGATHER-NEXT: # BB#15: # %cond.load19
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
+; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB7_16: # %else20
+; NOGATHER-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; NOGATHER-NEXT: vpslld $31, %xmm3, %xmm3
+; NOGATHER-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; NOGATHER-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; NOGATHER-NEXT: retq
entry:
%ld = load <8 x float*>, <8 x float*>* %ptr
%res = call <8 x float> @llvm.masked.gather.v8float(<8 x float*> %ld, i32 0, <8 x i1> %masks, <8 x float> %passthro)
@@ -234,6 +574,54 @@ define <4 x i64> @masked_gather_v4i64(<4
; X64-NEXT: vpgatherqq %ymm0, (,%ymm2), %ymm1
; X64-NEXT: vmovdqa %ymm1, %ymm0
; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v4i64:
+; NOGATHER: # BB#0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %YMM2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB8_2
+; NOGATHER-NEXT: # BB#1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; NOGATHER-NEXT: .LBB8_2: # %else
+; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB8_4
+; NOGATHER-NEXT: # BB#3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm4
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT: .LBB8_4: # %else2
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB8_6
+; NOGATHER-NEXT: # BB#5: # %cond.load4
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm4
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm4
+; NOGATHER-NEXT: vpinsrq $0, (%rax), %xmm4, %xmm4
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB8_6: # %else5
+; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB8_8
+; NOGATHER-NEXT: # BB#7: # %cond.load7
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
+; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB8_8: # %else8
+; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vpsrad $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vpmovsxdq %xmm0, %xmm3
+; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; NOGATHER-NEXT: vpmovsxdq %xmm0, %xmm0
+; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; NOGATHER-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; NOGATHER-NEXT: retq
entry:
%ld = load <4 x i64*>, <4 x i64*>* %ptr
%res = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %ld, i32 0, <4 x i1> %masks, <4 x i64> %passthro)
@@ -263,6 +651,54 @@ define <4 x double> @masked_gather_v4dou
; X64-NEXT: vgatherqpd %ymm0, (,%ymm2), %ymm1
; X64-NEXT: vmovapd %ymm1, %ymm0
; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v4double:
+; NOGATHER: # BB#0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %YMM2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB9_2
+; NOGATHER-NEXT: # BB#1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; NOGATHER-NEXT: .LBB9_2: # %else
+; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB9_4
+; NOGATHER-NEXT: # BB#3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0]
+; NOGATHER-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
+; NOGATHER-NEXT: .LBB9_4: # %else2
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB9_6
+; NOGATHER-NEXT: # BB#5: # %cond.load4
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm4
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm4
+; NOGATHER-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB9_6: # %else5
+; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB9_8
+; NOGATHER-NEXT: # BB#7: # %cond.load7
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
+; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB9_8: # %else8
+; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vpsrad $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vpmovsxdq %xmm0, %xmm3
+; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; NOGATHER-NEXT: vpmovsxdq %xmm0, %xmm0
+; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; NOGATHER-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; NOGATHER-NEXT: retq
entry:
%ld = load <4 x double*>, <4 x double*>* %ptr
%res = call <4 x double> @llvm.masked.gather.v4double(<4 x double*> %ld, i32 0, <4 x i1> %masks, <4 x double> %passthro)
@@ -286,6 +722,28 @@ define <2 x i64> @masked_gather_v2i64(<2
; X64-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v2i64:
+; NOGATHER: # BB#0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %XMM2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB10_2
+; NOGATHER-NEXT: # BB#1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; NOGATHER-NEXT: .LBB10_2: # %else
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB10_4
+; NOGATHER-NEXT: # BB#3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2
+; NOGATHER-NEXT: .LBB10_4: # %else2
+; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0
+; NOGATHER-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT: retq
entry:
%ld = load <2 x i64*>, <2 x i64*>* %ptr
%res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ld, i32 0, <2 x i1> %masks, <2 x i64> %passthro)
@@ -309,6 +767,28 @@ define <2 x double> @masked_gather_v2dou
; X64-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1
; X64-NEXT: vmovapd %xmm1, %xmm0
; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v2double:
+; NOGATHER: # BB#0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %XMM2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB11_2
+; NOGATHER-NEXT: # BB#1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; NOGATHER-NEXT: .LBB11_2: # %else
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB11_4
+; NOGATHER-NEXT: # BB#3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; NOGATHER-NEXT: .LBB11_4: # %else2
+; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0
+; NOGATHER-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT: retq
entry:
%ld = load <2 x double*>, <2 x double*>* %ptr
%res = call <2 x double> @llvm.masked.gather.v2double(<2 x double*> %ld, i32 0, <2 x i1> %masks, <2 x double> %passthro)