[llvm] r296859 - [X86] Generate VZEROUPPER for Skylake-avx512.
Amjad Aboud via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 3 01:03:25 PST 2017
Author: aaboud
Date: Fri Mar 3 03:03:24 2017
New Revision: 296859
URL: http://llvm.org/viewvc/llvm-project?rev=296859&view=rev
Log:
[X86] Generate VZEROUPPER for Skylake-avx512.
VZEROUPPER should not be issued on Knights Landing (KNL), where partial YMM/ZMM writes are fast, but it should be issued on Skylake-avx512.
Differential Revision: https://reviews.llvm.org/D29874
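
In effect, the pass gate in X86VZeroUpper.cpp changes as sketched below
(paraphrasing the hunk in that file further down):

  // Before: any AVX-512 subtarget disabled the pass outright.
  if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite())
    return false;

  // After: AVX-512 subtargets run the pass too. Targets with no partial
  // write penalty (KNL, btver2) opt out via the renamed feature instead.
  if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite())
    return false;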
Modified:
llvm/trunk/lib/Target/X86/X86.td
llvm/trunk/lib/Target/X86/X86Subtarget.cpp
llvm/trunk/lib/Target/X86/X86Subtarget.h
llvm/trunk/lib/Target/X86/X86VZeroUpper.cpp
llvm/trunk/test/CodeGen/X86/avg.ll
llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll
llvm/trunk/test/CodeGen/X86/avx-vzeroupper.ll
llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll
llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll
llvm/trunk/test/CodeGen/X86/avx512-any_extend_load.ll
llvm/trunk/test/CodeGen/X86/avx512-arith.ll
llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll
llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
llvm/trunk/test/CodeGen/X86/avx512-ext.ll
llvm/trunk/test/CodeGen/X86/avx512-extract-subvector.ll
llvm/trunk/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
llvm/trunk/test/CodeGen/X86/avx512-insert-extract_i1.ll
llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
llvm/trunk/test/CodeGen/X86/avx512-mask-spills.ll
llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll
llvm/trunk/test/CodeGen/X86/avx512-masked_memop-16-8.ll
llvm/trunk/test/CodeGen/X86/avx512-skx-insert-subvec.ll
llvm/trunk/test/CodeGen/X86/avx512-trunc.ll
llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
llvm/trunk/test/CodeGen/X86/combine-testm-and.ll
llvm/trunk/test/CodeGen/X86/compress_expand.ll
llvm/trunk/test/CodeGen/X86/fast-isel-nontemporal.ll
llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
llvm/trunk/test/CodeGen/X86/masked_memop.ll
llvm/trunk/test/CodeGen/X86/nontemporal-2.ll
llvm/trunk/test/CodeGen/X86/pmul.ll
llvm/trunk/test/CodeGen/X86/pr29112.ll
llvm/trunk/test/CodeGen/X86/pr30284.ll
llvm/trunk/test/CodeGen/X86/sad.ll
llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll
llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll
llvm/trunk/test/CodeGen/X86/sse-fsignum.ll
llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll
llvm/trunk/test/CodeGen/X86/vec_fp_to_int.ll
llvm/trunk/test/CodeGen/X86/vec_fpext.ll
llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll
llvm/trunk/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll
llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll
llvm/trunk/test/CodeGen/X86/vector-compare-results.ll
llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll
llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll
llvm/trunk/test/CodeGen/X86/vector-sext.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-masked.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
llvm/trunk/test/CodeGen/X86/vector-trunc.ll
llvm/trunk/test/CodeGen/X86/vector-tzcnt-128.ll
Modified: llvm/trunk/lib/Target/X86/X86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86.td?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86.td (original)
+++ llvm/trunk/lib/Target/X86/X86.td Fri Mar 3 03:03:24 2017
@@ -238,11 +238,12 @@ def FeatureSlowIncDec : SubtargetFeature
def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software floating point features.">;
-// On at least some AMD processors, there is no performance hazard to writing
-// only the lower parts of a YMM register without clearing the upper part.
-def FeatureFastPartialYMMWrite
- : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
- "true", "Partial writes to YMM registers are fast">;
+// On some X86 processors, there is no performance hazard to writing only the
+// lower parts of a YMM or ZMM register without clearing the upper part.
+def FeatureFastPartialYMMorZMMWrite
+ : SubtargetFeature<"fast-partial-ymm-or-zmm-write",
+ "HasFastPartialYMMorZMMWrite",
+ "true", "Partial writes to YMM/ZMM registers are fast">;
// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
// vector FSQRT has higher throughput than the corresponding NR code.
@@ -545,7 +546,8 @@ class KnightsLandingProc<string Name> :
FeatureLZCNT,
FeatureBMI,
FeatureBMI2,
- FeatureFMA
+ FeatureFMA,
+ FeatureFastPartialYMMorZMMWrite
]>;
def : KnightsLandingProc<"knl">;
@@ -659,7 +661,7 @@ def : ProcessorModel<"btver2", BtVer2Mod
FeatureXSAVEOPT,
FeatureSlowSHLD,
FeatureLAHFSAHF,
- FeatureFastPartialYMMWrite
+ FeatureFastPartialYMMorZMMWrite
]>;
// Bulldozer
Modified: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.cpp?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp Fri Mar 3 03:03:24 2017
@@ -298,7 +298,7 @@ void X86Subtarget::initializeEnvironment
HasSSEUnalignedMem = false;
HasCmpxchg16b = false;
UseLeaForSP = false;
- HasFastPartialYMMWrite = false;
+ HasFastPartialYMMorZMMWrite = false;
HasFastScalarFSQRT = false;
HasFastVectorFSQRT = false;
HasFastLZCNT = false;
Modified: llvm/trunk/lib/Target/X86/X86Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.h?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h Fri Mar 3 03:03:24 2017
@@ -207,8 +207,8 @@ protected:
bool UseLeaForSP;
/// True if there is no performance penalty to writing only the lower parts
- /// of a YMM register without clearing the upper part.
- bool HasFastPartialYMMWrite;
+ /// of a YMM or ZMM register without clearing the upper part.
+ bool HasFastPartialYMMorZMMWrite;
/// True if hardware SQRTSS instruction is at least as fast (latency) as
/// RSQRTSS followed by a Newton-Raphson iteration.
@@ -465,7 +465,9 @@ public:
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
- bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
+ bool hasFastPartialYMMorZMMWrite() const {
+ return HasFastPartialYMMorZMMWrite;
+ }
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasFastLZCNT() const { return HasFastLZCNT; }
Modified: llvm/trunk/lib/Target/X86/X86VZeroUpper.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86VZeroUpper.cpp?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86VZeroUpper.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86VZeroUpper.cpp Fri Mar 3 03:03:24 2017
@@ -56,11 +56,11 @@ namespace {
// Core algorithm state:
// BlockState - Each block is either:
- // - PASS_THROUGH: There are neither YMM dirtying instructions nor
+ // - PASS_THROUGH: There are neither YMM/ZMM dirtying instructions nor
// vzeroupper instructions in this block.
// - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
- // block that will ensure that YMM is clean on exit.
- // - EXITS_DIRTY: An instruction in the block dirties YMM and no
+ // block that will ensure that YMM/ZMM is clean on exit.
+ // - EXITS_DIRTY: An instruction in the block dirties YMM/ZMM and no
// subsequent vzeroupper in the block clears it.
//
// AddedToDirtySuccessors - This flag is raised when a block is added to the
@@ -106,51 +106,54 @@ const char* VZeroUpperInserter::getBlock
llvm_unreachable("Invalid block exit state.");
}
-static bool isYmmReg(unsigned Reg) {
- return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
+/// VZEROUPPER cleans state that is related to Y/ZMM0-15 only.
+/// Thus, there is no need to check for Y/ZMM16 and above.
+static bool isYmmOrZmmReg(unsigned Reg) {
+ return (Reg >= X86::YMM0 && Reg <= X86::YMM15) ||
+ (Reg >= X86::ZMM0 && Reg <= X86::ZMM15);
}
-static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
+static bool checkFnHasLiveInYmmOrZmm(MachineRegisterInfo &MRI) {
for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
E = MRI.livein_end(); I != E; ++I)
- if (isYmmReg(I->first))
+ if (isYmmOrZmmReg(I->first))
return true;
return false;
}
-static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+static bool clobbersAllYmmAndZmmRegs(const MachineOperand &MO) {
for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
if (!MO.clobbersPhysReg(reg))
return false;
}
+ for (unsigned reg = X86::ZMM0; reg <= X86::ZMM15; ++reg) {
+ if (!MO.clobbersPhysReg(reg))
+ return false;
+ }
return true;
}
-static bool hasYmmReg(MachineInstr &MI) {
+static bool hasYmmOrZmmReg(MachineInstr &MI) {
for (const MachineOperand &MO : MI.operands()) {
- if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
+ if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmAndZmmRegs(MO))
return true;
if (!MO.isReg())
continue;
if (MO.isDebug())
continue;
- if (isYmmReg(MO.getReg()))
+ if (isYmmOrZmmReg(MO.getReg()))
return true;
}
return false;
}
-/// Check if any YMM register will be clobbered by this instruction.
-static bool callClobbersAnyYmmReg(MachineInstr &MI) {
+/// Check if given call instruction has a RegMask operand.
+static bool callHasRegMask(MachineInstr &MI) {
assert(MI.isCall() && "Can only be called on call instructions.");
for (const MachineOperand &MO : MI.operands()) {
- if (!MO.isRegMask())
- continue;
- for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
- if (MO.clobbersPhysReg(reg))
- return true;
- }
+ if (MO.isRegMask())
+ return true;
}
return false;
}
@@ -175,17 +178,20 @@ void VZeroUpperInserter::addDirtySuccess
/// Loop over all of the instructions in the basic block, inserting vzeroupper
/// instructions before function calls.
void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
-
// Start by assuming that the block is PASS_THROUGH which implies no unguarded
// calls.
BlockExitState CurState = PASS_THROUGH;
BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
for (MachineInstr &MI : MBB) {
+ bool IsCall = MI.isCall();
+ bool IsReturn = MI.isReturn();
+ bool IsControlFlow = IsCall || IsReturn;
+
// No need for vzeroupper before iret in interrupt handler function,
- // epilogue will restore YMM registers if needed.
- bool IsReturnFromX86INTR = IsX86INTR && MI.isReturn();
- bool IsControlFlow = MI.isCall() || MI.isReturn();
+ // epilogue will restore YMM/ZMM registers if needed.
+ if (IsX86INTR && IsReturn)
+ continue;
// An existing VZERO* instruction resets the state.
if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) {
@@ -194,30 +200,30 @@ void VZeroUpperInserter::processBasicBlo
}
// Shortcut: don't need to check regular instructions in dirty state.
- if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY)
+ if (!IsControlFlow && CurState == EXITS_DIRTY)
continue;
- if (hasYmmReg(MI)) {
- // We found a ymm-using instruction; this could be an AVX instruction,
- // or it could be control flow.
+ if (hasYmmOrZmmReg(MI)) {
+ // We found a ymm/zmm-using instruction; this could be an AVX/AVX512
+ // instruction, or it could be control flow.
CurState = EXITS_DIRTY;
continue;
}
// Check for control-flow out of the current function (which might
// indirectly execute SSE instructions).
- if (!IsControlFlow || IsReturnFromX86INTR)
+ if (!IsControlFlow)
continue;
- // If the call won't clobber any YMM register, skip it as well. It usually
- // happens on helper function calls (such as '_chkstk', '_ftol2') where
- // standard calling convention is not used (RegMask is not used to mark
- // register clobbered and register usage (def/imp-def/use) is well-defined
- // and explicitly specified.
- if (MI.isCall() && !callClobbersAnyYmmReg(MI))
+ // If the call has no RegMask, skip it as well. It usually happens on
+ // helper function calls (such as '_chkstk', '_ftol2') where the standard
+ // calling convention is not used (RegMask is not used to mark registers
+ // clobbered, and register usage (def/imp-def/use) is well-defined and
+ // explicitly specified).
+ if (IsCall && !callHasRegMask(MI))
continue;
- // The VZEROUPPER instruction resets the upper 128 bits of all AVX
+ // The VZEROUPPER instruction resets the upper 128 bits of YMM0-YMM15
// registers. In addition, the processor changes back to Clean state, after
// which execution of SSE instructions or AVX instructions has no transition
// penalty. Add the VZEROUPPER instruction before any function call/return
@@ -226,7 +232,7 @@ void VZeroUpperInserter::processBasicBlo
// predecessor block.
if (CurState == EXITS_DIRTY) {
// After the inserted VZEROUPPER the state becomes clean again, but
- // other YMM may appear before other subsequent calls or even before
+ // other YMM/ZMM uses may appear before subsequent calls or even before
// the end of the BB.
insertVZeroUpper(MI, MBB);
CurState = EXITS_CLEAN;
@@ -257,30 +263,32 @@ void VZeroUpperInserter::processBasicBlo
/// function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite())
+ if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite())
return false;
TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
EverMadeChange = false;
IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR;
- bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
+ bool FnHasLiveInYmmOrZmm = checkFnHasLiveInYmmOrZmm(MRI);
- // Fast check: if the function doesn't use any ymm registers, we don't need
- // to insert any VZEROUPPER instructions. This is constant-time, so it is
- // cheap in the common case of no ymm use.
- bool YMMUsed = FnHasLiveInYmm;
- if (!YMMUsed) {
- const TargetRegisterClass *RC = &X86::VR256RegClass;
- for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
- i++) {
- if (!MRI.reg_nodbg_empty(*i)) {
- YMMUsed = true;
- break;
+ // Fast check: if the function doesn't use any ymm/zmm registers, we don't
+ // need to insert any VZEROUPPER instructions. This is constant-time, so it
+ // is cheap in the common case of no ymm/zmm use.
+ bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm;
+ const TargetRegisterClass *RCs[2] = {&X86::VR256RegClass, &X86::VR512RegClass};
+ for (auto *RC : RCs) {
+ if (!YmmOrZmmUsed) {
+ for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
+ i++) {
+ if (!MRI.reg_nodbg_empty(*i)) {
+ YmmOrZmmUsed = true;
+ break;
+ }
}
}
}
- if (!YMMUsed) {
+ if (!YmmOrZmmUsed) {
return false;
}
@@ -294,9 +302,9 @@ bool VZeroUpperInserter::runOnMachineFun
for (MachineBasicBlock &MBB : MF)
processBasicBlock(MBB);
- // If any YMM regs are live-in to this function, add the entry block to the
- // DirtySuccessors list
- if (FnHasLiveInYmm)
+ // If any YMM/ZMM regs are live-in to this function, add the entry block to
+ // the DirtySuccessors list
+ if (FnHasLiveInYmmOrZmm)
addDirtySuccessor(MF.front());
// Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
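
For readers unfamiliar with the pass, the comments in the hunks above
describe a two-phase algorithm: a local scan that classifies each block
and guards calls/returns reached in a dirty state, followed by a worklist
propagation of dirtiness across block boundaries. Below is a distilled,
self-contained C++ sketch of just that decision logic; Inst and Block are
hypothetical simplified stand-ins for MachineInstr/MachineBasicBlock, and
special cases handled by the real pass (interrupt-handler returns, calls
without a RegMask) are omitted:

  #include <deque>
  #include <utility>
  #include <vector>

  enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };

  struct Inst {
    bool IsCall = false, IsReturn = false, IsVZero = false;
    // Touches YMM0-15/ZMM0-15; for calls, a RegMask that does not clobber
    // all of them counts as a use (guarding would destroy live values).
    bool UsesYmmOrZmm = false;
  };

  struct Block {
    std::vector<Inst> Insts;
    std::vector<int> Succs;
    BlockExitState State = PASS_THROUGH;
    int FirstUnguardedCall = -1; // first call/return seen while still clean
    bool OnWorklist = false;
  };

  // Phase 1: local scan. A call/return reached in a dirty state gets a
  // vzeroupper right before it; insertion points are collected as
  // (block, instruction-index) pairs.
  static void scanBlocks(std::vector<Block> &Blocks,
                         std::vector<std::pair<int, int>> &Inserts) {
    for (int B = 0; B != (int)Blocks.size(); ++B) {
      BlockExitState Cur = PASS_THROUGH;
      for (int I = 0; I != (int)Blocks[B].Insts.size(); ++I) {
        const Inst &MI = Blocks[B].Insts[I];
        if (MI.IsVZero)      { Cur = EXITS_CLEAN; continue; }
        if (MI.UsesYmmOrZmm) { Cur = EXITS_DIRTY; continue; }
        if (!MI.IsCall && !MI.IsReturn)
          continue;
        if (Cur == EXITS_DIRTY) {
          Inserts.push_back({B, I});
          Cur = EXITS_CLEAN;
        } else if (Blocks[B].FirstUnguardedCall < 0) {
          Blocks[B].FirstUnguardedCall = I; // guard later if a pred is dirty
        }
      }
      Blocks[B].State = Cur;
    }
  }

  // Phase 2: propagate dirtiness. Successors of EXITS_DIRTY blocks (and
  // the entry block, when any YMM/ZMM register is live-in) must guard
  // their first unguarded call; PASS_THROUGH blocks pass the dirtiness on.
  static void propagate(std::vector<Block> &Blocks, bool LiveInYmmOrZmm,
                        std::vector<std::pair<int, int>> &Inserts) {
    std::deque<int> Work;
    auto Push = [&](int B) {
      if (!Blocks[B].OnWorklist) {
        Blocks[B].OnWorklist = true;
        Work.push_back(B);
      }
    };
    for (int B = 0; B != (int)Blocks.size(); ++B)
      if (Blocks[B].State == EXITS_DIRTY)
        for (int S : Blocks[B].Succs)
          Push(S);
    if (LiveInYmmOrZmm && !Blocks.empty())
      Push(0);
    while (!Work.empty()) {
      int B = Work.front();
      Work.pop_front();
      if (Blocks[B].FirstUnguardedCall >= 0)
        Inserts.push_back({B, Blocks[B].FirstUnguardedCall});
      if (Blocks[B].State == PASS_THROUGH)
        for (int S : Blocks[B].Succs)
          Push(S);
    }
  }

The real pass performs the insertions in place and tracks whether it ever
made a change; the sketch only collects where the vzerouppers would go.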
Modified: llvm/trunk/test/CodeGen/X86/avg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avg.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avg.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avg.ll Fri Mar 3 03:03:24 2017
@@ -234,6 +234,7 @@ define void @avg_v32i8(<32 x i8>* %a, <3
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i8:
@@ -241,6 +242,7 @@ define void @avg_v32i8(<32 x i8>* %a, <3
; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i8>, <32 x i8>* %a
%2 = load <32 x i8>, <32 x i8>* %b
@@ -562,6 +564,7 @@ define void @avg_v64i8(<64 x i8>* %a, <6
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8:
@@ -569,6 +572,7 @@ define void @avg_v64i8(<64 x i8>* %a, <6
; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %a
%2 = load <64 x i8>, <64 x i8>* %b
@@ -726,6 +730,7 @@ define void @avg_v16i16(<16 x i16>* %a,
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v16i16:
@@ -733,6 +738,7 @@ define void @avg_v16i16(<16 x i16>* %a,
; AVX512BW-NEXT: vmovdqa (%rsi), %ymm0
; AVX512BW-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %a
%2 = load <16 x i16>, <16 x i16>* %b
@@ -888,6 +894,7 @@ define void @avg_v32i16(<32 x i16>* %a,
; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i16:
@@ -895,6 +902,7 @@ define void @avg_v32i16(<32 x i16>* %a,
; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %a
%2 = load <32 x i16>, <32 x i16>* %b
@@ -1138,6 +1146,7 @@ define void @avg_v32i8_2(<32 x i8>* %a,
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i8_2:
@@ -1145,6 +1154,7 @@ define void @avg_v32i8_2(<32 x i8>* %a,
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i8>, <32 x i8>* %a
%2 = load <32 x i8>, <32 x i8>* %b
@@ -1392,6 +1402,7 @@ define void @avg_v64i8_2(<64 x i8>* %a,
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8_2:
@@ -1399,6 +1410,7 @@ define void @avg_v64i8_2(<64 x i8>* %a,
; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %a
%2 = load <64 x i8>, <64 x i8>* %b
@@ -1557,6 +1569,7 @@ define void @avg_v16i16_2(<16 x i16>* %a
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v16i16_2:
@@ -1564,6 +1577,7 @@ define void @avg_v16i16_2(<16 x i16>* %a
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %a
%2 = load <16 x i16>, <16 x i16>* %b
@@ -1719,6 +1733,7 @@ define void @avg_v32i16_2(<32 x i16>* %a
; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i16_2:
@@ -1726,6 +1741,7 @@ define void @avg_v32i16_2(<32 x i16>* %a
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %a
%2 = load <32 x i16>, <32 x i16>* %b
@@ -1924,6 +1940,7 @@ define void @avg_v32i8_const(<32 x i8>*
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i8_const:
@@ -1931,6 +1948,7 @@ define void @avg_v32i8_const(<32 x i8>*
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i8>, <32 x i8>* %a
%2 = zext <32 x i8> %1 to <32 x i32>
@@ -2148,6 +2166,7 @@ define void @avg_v64i8_const(<64 x i8>*
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vmovdqu %ymm2, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8_const:
@@ -2155,6 +2174,7 @@ define void @avg_v64i8_const(<64 x i8>*
; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %a
%2 = zext <64 x i8> %1 to <64 x i32>
@@ -2288,6 +2308,7 @@ define void @avg_v16i16_const(<16 x i16>
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v16i16_const:
@@ -2295,6 +2316,7 @@ define void @avg_v16i16_const(<16 x i16>
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %a
%2 = zext <16 x i16> %1 to <16 x i32>
@@ -2411,6 +2433,7 @@ define void @avg_v32i16_const(<32 x i16>
; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1
; AVX512F-NEXT: vpmovdw %zmm1, (%rax)
; AVX512F-NEXT: vpmovdw %zmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i16_const:
@@ -2418,6 +2441,7 @@ define void @avg_v32i16_const(<32 x i16>
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %a
%2 = zext <32 x i16> %1 to <32 x i32>
Modified: llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx-intrinsics-x86.ll Fri Mar 3 03:03:24 2017
@@ -149,6 +149,7 @@ define <4 x float> @test_x86_avx_cvt_pd2
; AVX512VL-LABEL: test_x86_avx_cvt_pd2_ps_256:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vcvtpd2ps %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xc0]
+; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
@@ -166,6 +167,7 @@ define <4 x i32> @test_x86_avx_cvt_pd2dq
; AVX512VL-LABEL: test_x86_avx_cvt_pd2dq_256:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vcvtpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xc0]
+; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
@@ -210,6 +212,7 @@ define <4 x i32> @test_x86_avx_cvtt_pd2d
; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vcvttpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0]
+; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
@@ -361,18 +364,12 @@ declare void @llvm.x86.avx.maskstore.pd(
define void @test_x86_avx_maskstore_pd_256(i8* %a0, <4 x i64> %mask, <4 x double> %a2) {
-; AVX-LABEL: test_x86_avx_maskstore_pd_256:
-; AVX: ## BB#0:
-; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_maskstore_pd_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_maskstore_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %mask, <4 x double> %a2)
ret void
}
@@ -392,18 +389,12 @@ declare void @llvm.x86.avx.maskstore.ps(
define void @test_x86_avx_maskstore_ps_256(i8* %a0, <8 x i32> %mask, <8 x float> %a2) {
-; AVX-LABEL: test_x86_avx_maskstore_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_maskstore_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_maskstore_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %mask, <8 x float> %a2)
ret void
}
@@ -475,16 +466,11 @@ declare <8 x float> @llvm.x86.avx.min.ps
define i32 @test_x86_avx_movmsk_pd_256(<4 x double> %a0) {
-; AVX-LABEL: test_x86_avx_movmsk_pd_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_movmsk_pd_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_movmsk_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -492,16 +478,11 @@ declare i32 @llvm.x86.avx.movmsk.pd.256(
define i32 @test_x86_avx_movmsk_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_movmsk_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_movmsk_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_movmsk_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -514,20 +495,13 @@ declare i32 @llvm.x86.avx.movmsk.ps.256(
define i32 @test_x86_avx_ptestc_256(<4 x i64> %a0, <4 x i64> %a1) {
-; AVX-LABEL: test_x86_avx_ptestc_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_ptestc_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX512VL-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX512VL-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_ptestc_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
+; CHECK-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
+; CHECK-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -535,20 +509,13 @@ declare i32 @llvm.x86.avx.ptestc.256(<4
define i32 @test_x86_avx_ptestnzc_256(<4 x i64> %a0, <4 x i64> %a1) {
-; AVX-LABEL: test_x86_avx_ptestnzc_256:
-; AVX: ## BB#0:
-; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_ptestnzc_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX512VL-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_ptestnzc_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
+; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -556,20 +523,13 @@ declare i32 @llvm.x86.avx.ptestnzc.256(<
define i32 @test_x86_avx_ptestz_256(<4 x i64> %a0, <4 x i64> %a1) {
-; AVX-LABEL: test_x86_avx_ptestz_256:
-; AVX: ## BB#0:
-; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_ptestz_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX512VL-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_ptestz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
+; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -797,20 +757,13 @@ declare i32 @llvm.x86.avx.vtestc.pd(<2 x
define i32 @test_x86_avx_vtestc_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; AVX-LABEL: test_x86_avx_vtestc_pd_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestc_pd_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX512VL-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX512VL-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestc_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
+; CHECK-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
+; CHECK-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -831,20 +784,13 @@ declare i32 @llvm.x86.avx.vtestc.ps(<4 x
define i32 @test_x86_avx_vtestc_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; AVX-LABEL: test_x86_avx_vtestc_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestc_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX512VL-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX512VL-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestc_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
+; CHECK-NEXT: sbbl %eax, %eax ## encoding: [0x19,0xc0]
+; CHECK-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -865,20 +811,13 @@ declare i32 @llvm.x86.avx.vtestnzc.pd(<2
define i32 @test_x86_avx_vtestnzc_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; AVX-LABEL: test_x86_avx_vtestnzc_pd_256:
-; AVX: ## BB#0:
-; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestnzc_pd_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX512VL-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestnzc_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
+; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -899,20 +838,13 @@ declare i32 @llvm.x86.avx.vtestnzc.ps(<4
define i32 @test_x86_avx_vtestnzc_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; AVX-LABEL: test_x86_avx_vtestnzc_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestnzc_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX512VL-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestnzc_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
+; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -933,20 +865,13 @@ declare i32 @llvm.x86.avx.vtestz.pd(<2 x
define i32 @test_x86_avx_vtestz_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; AVX-LABEL: test_x86_avx_vtestz_pd_256:
-; AVX: ## BB#0:
-; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestz_pd_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX512VL-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestz_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
+; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -967,20 +892,13 @@ declare i32 @llvm.x86.avx.vtestz.ps(<4 x
define i32 @test_x86_avx_vtestz_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; AVX-LABEL: test_x86_avx_vtestz_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestz_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX512VL-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestz_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
+; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1024,6 +942,7 @@ define void @movnt_dq(i8* %p, <2 x i64>
; AVX512VL-NEXT: vpaddq LCPI65_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0x05,A,A,A,A]
; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI65_0, kind: FK_Data_4
; AVX512VL-NEXT: vmovntdq %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x00]
+; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%a2 = add <2 x i64> %a1, <i64 1, i64 1>
%a3 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -1044,6 +963,7 @@ define void @movnt_ps(i8* %p, <8 x float
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; AVX512VL-NEXT: vmovntps %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x00]
+; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
tail call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> %a) nounwind
ret void
@@ -1067,6 +987,7 @@ define void @movnt_pd(i8* %p, <4 x doubl
; AVX512VL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x57,0xc9]
; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
; AVX512VL-NEXT: vmovntpd %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x2b,0x00]
+; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
tail call void @llvm.x86.avx.movnt.pd.256(i8* %p, <4 x double> %a2) nounwind
Modified: llvm/trunk/test/CodeGen/X86/avx-vzeroupper.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-vzeroupper.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-vzeroupper.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx-vzeroupper.ll Fri Mar 3 03:03:24 2017
@@ -1,8 +1,9 @@
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-write | FileCheck --check-prefix=FASTYMM %s
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck --check-prefix=FAST-YMM-ZMM %s
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck --check-prefix=BTVER2 %s
-; FASTYMM-NOT: vzeroupper
+; FAST-YMM-ZMM-NOT: vzeroupper
; BTVER2-NOT: vzeroupper
declare i32 @foo()
Modified: llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll Fri Mar 3 03:03:24 2017
@@ -227,16 +227,11 @@ declare <32 x i8> @llvm.x86.avx2.pminu.b
define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) {
-; AVX2-LABEL: test_x86_avx2_pmovmskb:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
-; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_pmovmskb:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_pmovmskb:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1179,18 +1174,12 @@ declare void @llvm.x86.avx2.maskstore.q(
define void @test_x86_avx2_maskstore_q_256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) {
-; AVX2-LABEL: test_x86_avx2_maskstore_q_256:
-; AVX2: ## BB#0:
-; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08]
-; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_maskstore_q_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_maskstore_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
call void @llvm.x86.avx2.maskstore.q.256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2)
ret void
}
@@ -1210,18 +1199,12 @@ declare void @llvm.x86.avx2.maskstore.d(
define void @test_x86_avx2_maskstore_d_256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) {
-; AVX2-LABEL: test_x86_avx2_maskstore_d_256:
-; AVX2: ## BB#0:
-; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08]
-; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_maskstore_d_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_maskstore_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
call void @llvm.x86.avx2.maskstore.d.256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2)
ret void
}
@@ -1522,18 +1505,12 @@ declare <4 x float> @llvm.x86.avx2.gathe
<2 x i64>, <4 x float>, i8) nounwind readonly
define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1, <4 x i64> %idx, <4 x float> %mask) {
-; AVX2-LABEL: test_x86_avx2_gather_q_ps_256:
-; AVX2: ## BB#0:
-; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48]
-; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_gather_q_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_gather_q_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0,
i8* %a1, <4 x i64> %idx, <4 x float> %mask, i8 2) ;
ret <4 x float> %res
@@ -1633,18 +1610,12 @@ declare <4 x i32> @llvm.x86.avx2.gather.
<2 x i64>, <4 x i32>, i8) nounwind readonly
define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1, <4 x i64> %idx, <4 x i32> %mask) {
-; AVX2-LABEL: test_x86_avx2_gather_q_d_256:
-; AVX2: ## BB#0:
-; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48]
-; AVX2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_gather_q_d_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_gather_q_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0,
i8* %a1, <4 x i64> %idx, <4 x i32> %mask, i8 2) ;
ret <4 x i32> %res
Modified: llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll Fri Mar 3 03:03:24 2017
@@ -1200,6 +1200,7 @@ define void @isel_crash_32b(i8* %cV_R.ad
; X32-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp)
; X32-AVX512VL-NEXT: movl %ebp, %esp
; X32-AVX512VL-NEXT: popl %ebp
+; X32-AVX512VL-NEXT: vzeroupper
; X32-AVX512VL-NEXT: retl
;
; X64-AVX512VL-LABEL: isel_crash_32b:
@@ -1223,6 +1224,7 @@ define void @isel_crash_32b(i8* %cV_R.ad
; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: movq %rbp, %rsp
; X64-AVX512VL-NEXT: popq %rbp
+; X64-AVX512VL-NEXT: vzeroupper
; X64-AVX512VL-NEXT: retq
eintry:
%__a.addr.i = alloca <4 x i64>, align 16
@@ -1347,6 +1349,7 @@ define void @isel_crash_16w(i16* %cV_R.a
; X32-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp)
; X32-AVX512VL-NEXT: movl %ebp, %esp
; X32-AVX512VL-NEXT: popl %ebp
+; X32-AVX512VL-NEXT: vzeroupper
; X32-AVX512VL-NEXT: retl
;
; X64-AVX512VL-LABEL: isel_crash_16w:
@@ -1370,6 +1373,7 @@ define void @isel_crash_16w(i16* %cV_R.a
; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: movq %rbp, %rsp
; X64-AVX512VL-NEXT: popq %rbp
+; X64-AVX512VL-NEXT: vzeroupper
; X64-AVX512VL-NEXT: retq
eintry:
%__a.addr.i = alloca <4 x i64>, align 16
@@ -1437,28 +1441,28 @@ entry:
}
define void @isel_crash_8d(i32* %cV_R.addr) {
-; X32-AVX2-LABEL: isel_crash_8d:
-; X32-AVX2: ## BB#0: ## %eintry
-; X32-AVX2-NEXT: pushl %ebp
-; X32-AVX2-NEXT: Lcfi9:
-; X32-AVX2-NEXT: .cfi_def_cfa_offset 8
-; X32-AVX2-NEXT: Lcfi10:
-; X32-AVX2-NEXT: .cfi_offset %ebp, -8
-; X32-AVX2-NEXT: movl %esp, %ebp
-; X32-AVX2-NEXT: Lcfi11:
-; X32-AVX2-NEXT: .cfi_def_cfa_register %ebp
-; X32-AVX2-NEXT: andl $-32, %esp
-; X32-AVX2-NEXT: subl $128, %esp
-; X32-AVX2-NEXT: movl 8(%ebp), %eax
-; X32-AVX2-NEXT: vxorps %ymm0, %ymm0, %ymm0
-; X32-AVX2-NEXT: vmovaps %ymm0, (%esp)
-; X32-AVX2-NEXT: vbroadcastss (%eax), %ymm1
-; X32-AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT: movl %ebp, %esp
-; X32-AVX2-NEXT: popl %ebp
-; X32-AVX2-NEXT: vzeroupper
-; X32-AVX2-NEXT: retl
+; X32-LABEL: isel_crash_8d:
+; X32: ## BB#0: ## %eintry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: Lcfi9:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: Lcfi10:
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: Lcfi11:
+; X32-NEXT: .cfi_def_cfa_register %ebp
+; X32-NEXT: andl $-32, %esp
+; X32-NEXT: subl $128, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT: vmovaps %ymm0, (%esp)
+; X32-NEXT: vbroadcastss (%eax), %ymm1
+; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X32-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
;
; X64-AVX2-LABEL: isel_crash_8d:
; X64-AVX2: ## BB#0: ## %eintry
@@ -1484,28 +1488,6 @@ define void @isel_crash_8d(i32* %cV_R.ad
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
-; X32-AVX512VL-LABEL: isel_crash_8d:
-; X32-AVX512VL: ## BB#0: ## %eintry
-; X32-AVX512VL-NEXT: pushl %ebp
-; X32-AVX512VL-NEXT: Lcfi9:
-; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 8
-; X32-AVX512VL-NEXT: Lcfi10:
-; X32-AVX512VL-NEXT: .cfi_offset %ebp, -8
-; X32-AVX512VL-NEXT: movl %esp, %ebp
-; X32-AVX512VL-NEXT: Lcfi11:
-; X32-AVX512VL-NEXT: .cfi_def_cfa_register %ebp
-; X32-AVX512VL-NEXT: andl $-32, %esp
-; X32-AVX512VL-NEXT: subl $128, %esp
-; X32-AVX512VL-NEXT: movl 8(%ebp), %eax
-; X32-AVX512VL-NEXT: vxorps %ymm0, %ymm0, %ymm0
-; X32-AVX512VL-NEXT: vmovaps %ymm0, (%esp)
-; X32-AVX512VL-NEXT: vbroadcastss (%eax), %ymm1
-; X32-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT: movl %ebp, %esp
-; X32-AVX512VL-NEXT: popl %ebp
-; X32-AVX512VL-NEXT: retl
-;
; X64-AVX512VL-LABEL: isel_crash_8d:
; X64-AVX512VL: ## BB#0: ## %eintry
; X64-AVX512VL-NEXT: pushq %rbp
@@ -1526,6 +1508,7 @@ define void @isel_crash_8d(i32* %cV_R.ad
; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: movq %rbp, %rsp
; X64-AVX512VL-NEXT: popq %rbp
+; X64-AVX512VL-NEXT: vzeroupper
; X64-AVX512VL-NEXT: retq
eintry:
%__a.addr.i = alloca <4 x i64>, align 16
@@ -1676,6 +1659,7 @@ define void @isel_crash_4q(i64* %cV_R.ad
; X32-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp)
; X32-AVX512VL-NEXT: movl %ebp, %esp
; X32-AVX512VL-NEXT: popl %ebp
+; X32-AVX512VL-NEXT: vzeroupper
; X32-AVX512VL-NEXT: retl
;
; X64-AVX512VL-LABEL: isel_crash_4q:
@@ -1698,6 +1682,7 @@ define void @isel_crash_4q(i64* %cV_R.ad
; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: movq %rbp, %rsp
; X64-AVX512VL-NEXT: popq %rbp
+; X64-AVX512VL-NEXT: vzeroupper
; X64-AVX512VL-NEXT: retq
eintry:
%__a.addr.i = alloca <4 x i64>, align 16
Modified: llvm/trunk/test/CodeGen/X86/avx512-any_extend_load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-any_extend_load.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-any_extend_load.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-any_extend_load.ll Fri Mar 3 03:03:24 2017
@@ -4,12 +4,20 @@
define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
-; ALL-LABEL: any_extend_load_v8i64:
-; ALL: # BB#0:
-; ALL-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; ALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; ALL-NEXT: vpmovqb %zmm0, (%rdi)
-; ALL-NEXT: retq
+; KNL-LABEL: any_extend_load_v8i64:
+; KNL: # BB#0:
+; KNL-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; KNL-NEXT: vpmovqb %zmm0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: any_extend_load_v8i64:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX-NEXT: vpmovqb %zmm0, (%rdi)
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
%1 = zext <8 x i8> %wide.load to <8 x i64>
%2 = add nuw nsw <8 x i64> %1, <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4>
@@ -33,6 +41,7 @@ define void @any_extend_load_v8i32(<8 x
; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; SKX-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; SKX-NEXT: vpmovdb %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
%1 = zext <8 x i8> %wide.load to <8 x i32>
Modified: llvm/trunk/test/CodeGen/X86/avx512-arith.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-arith.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-arith.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-arith.ll Fri Mar 3 03:03:24 2017
@@ -233,6 +233,7 @@ define <2 x i64> @imulq128(<2 x i64> %y,
; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vpmullq %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: imulq128:
Modified: llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll Fri Mar 3 03:03:24 2017
@@ -140,6 +140,7 @@ define <8 x i32> @test5(<8 x i32>%a, <8
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _func8xi1
; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX-NEXT: vpslld $31, %ymm0, %ymm0
@@ -192,6 +193,7 @@ define <16 x i32> @test6(<16 x i32>%a, <
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _func16xi1
; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; SKX-NEXT: vpslld $31, %zmm0, %zmm0
@@ -291,6 +293,7 @@ define <8 x i1> @test7a(<8 x i32>%a, <8
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: callq _func8xi1
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k0
Modified: llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-cvt.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-cvt.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-cvt.ll Fri Mar 3 03:03:24 2017
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
define <16 x float> @sitof32(<16 x i32> %a) nounwind {
@@ -110,40 +110,78 @@ define <2 x float> @sltof2f32(<2 x i64>
; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%b = sitofp <2 x i64> %a to <2 x float>
ret <2 x float>%b
}
define <4 x float> @sltof4f32_mem(<4 x i64>* %a) {
-; NODQ-LABEL: sltof4f32_mem:
-; NODQ: ## BB#0:
-; NODQ-NEXT: vmovdqu (%rdi), %ymm0
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; NODQ-NEXT: retq
+; KNL-LABEL: sltof4f32_mem:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovdqu (%rdi), %ymm0
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT: retq
;
; VLDQ-LABEL: sltof4f32_mem:
; VLDQ: ## BB#0:
; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0
; VLDQ-NEXT: retq
;
+; VLNODQ-LABEL: sltof4f32_mem:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vmovdqu (%rdi), %ymm0
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; VLNODQ-NEXT: vzeroupper
+; VLNODQ-NEXT: retq
+;
; AVX512DQ-LABEL: sltof4f32_mem:
; AVX512DQ: ## BB#0:
; AVX512DQ-NEXT: vmovups (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: sltof4f32_mem:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%a1 = load <4 x i64>, <4 x i64>* %a, align 8
%b = sitofp <4 x i64> %a1 to <4 x float>
ret <4 x float>%b
@@ -218,65 +256,137 @@ define <4 x i64> @f32tosl(<4 x float> %a
}
define <4 x float> @sltof432(<4 x i64> %a) {
-; NODQ-LABEL: sltof432:
-; NODQ: ## BB#0:
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; NODQ-NEXT: retq
+; KNL-LABEL: sltof432:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT: retq
;
; VLDQ-LABEL: sltof432:
; VLDQ: ## BB#0:
; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
+; VLDQ-NEXT: vzeroupper
; VLDQ-NEXT: retq
;
+; VLNODQ-LABEL: sltof432:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; VLNODQ-NEXT: vzeroupper
+; VLNODQ-NEXT: retq
+;
; AVX512DQ-LABEL: sltof432:
; AVX512DQ: ## BB#0:
; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: sltof432:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
define <4 x float> @ultof432(<4 x i64> %a) {
-; NODQ-LABEL: ultof432:
-; NODQ: ## BB#0:
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; NODQ-NEXT: retq
+; KNL-LABEL: ultof432:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT: retq
;
; VLDQ-LABEL: ultof432:
; VLDQ: ## BB#0:
; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
+; VLDQ-NEXT: vzeroupper
; VLDQ-NEXT: retq
;
+; VLNODQ-LABEL: ultof432:
+; VLNODQ: ## BB#0:
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; VLNODQ-NEXT: vmovq %xmm0, %rax
+; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
+; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; VLNODQ-NEXT: vzeroupper
+; VLNODQ-NEXT: retq
+;
; AVX512DQ-LABEL: ultof432:
; AVX512DQ: ## BB#0:
; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: ultof432:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
+; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%b = uitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
@@ -355,17 +465,33 @@ define <8 x i32> @fptoui_256(<8 x float>
}
define <4 x i32> @fptoui_128(<4 x float> %a) nounwind {
-; NOVL-LABEL: fptoui_128:
-; NOVL: ## BB#0:
-; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
-; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; NOVL-NEXT: retq
+; KNL-LABEL: fptoui_128:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvttps2udq %zmm0, %zmm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
;
; VL-LABEL: fptoui_128:
; VL: ## BB#0:
; VL-NEXT: vcvttps2udq %xmm0, %xmm0
; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptoui_128:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: fptoui_128:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT: vcvttps2udq %zmm0, %zmm0
+; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%b = fptoui <4 x float> %a to <4 x i32>
ret <4 x i32> %b
}
@@ -380,17 +506,34 @@ define <8 x i32> @fptoui01(<8 x double>
}
define <4 x i32> @fptoui_256d(<4 x double> %a) nounwind {
-; NOVL-LABEL: fptoui_256d:
-; NOVL: ## BB#0:
-; NOVL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0
-; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; NOVL-NEXT: retq
+; KNL-LABEL: fptoui_256d:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvttpd2udq %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: retq
;
; VL-LABEL: fptoui_256d:
; VL: ## BB#0:
; VL-NEXT: vcvttpd2udq %ymm0, %xmm0
+; VL-NEXT: vzeroupper
; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptoui_256d:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: fptoui_256d:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT: vcvttpd2udq %zmm0, %ymm0
+; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%b = fptoui <4 x double> %a to <4 x i32>
ret <4 x i32> %b
}
@@ -448,10 +591,16 @@ define <8 x i32> @fptosi01(<8 x double>
}
define <4 x i32> @fptosi03(<4 x double> %a) {
-; ALL-LABEL: fptosi03:
-; ALL: ## BB#0:
-; ALL-NEXT: vcvttpd2dq %ymm0, %xmm0
-; ALL-NEXT: retq
+; KNL-LABEL: fptosi03:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; KNL-NEXT: retq
+;
+; AVX512-LABEL: fptosi03:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%b = fptosi <4 x double> %a to <4 x i32>
ret <4 x i32> %b
}
@@ -475,29 +624,54 @@ define <16 x float> @fptrunc00(<16 x dou
}
define <4 x float> @fptrunc01(<4 x double> %b) {
-; ALL-LABEL: fptrunc01:
-; ALL: ## BB#0:
-; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; ALL-NEXT: retq
+; KNL-LABEL: fptrunc01:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0
+; KNL-NEXT: retq
+;
+; AVX512-LABEL: fptrunc01:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vcvtpd2ps %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
ret <4 x float> %a
}
define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) {
-; NOVL-LABEL: fptrunc02:
-; NOVL: ## BB#0:
-; NOVL-NEXT: vpslld $31, %xmm1, %xmm1
-; NOVL-NEXT: vpsrad $31, %xmm1, %xmm1
-; NOVL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; NOVL-NEXT: vpand %xmm0, %xmm1, %xmm0
-; NOVL-NEXT: retq
+; KNL-LABEL: fptrunc02:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0
+; KNL-NEXT: vpand %xmm0, %xmm1, %xmm0
+; KNL-NEXT: retq
;
; VL-LABEL: fptrunc02:
; VL: ## BB#0:
; VL-NEXT: vpslld $31, %xmm1, %xmm1
; VL-NEXT: vptestmd %xmm1, %xmm1, %k1
; VL-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
+; VL-NEXT: vzeroupper
; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptrunc02:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX512DQ-NEXT: vcvtpd2ps %ymm0, %xmm0
+; AVX512DQ-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: fptrunc02:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX512BW-NEXT: vcvtpd2ps %ymm0, %xmm0
+; AVX512BW-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
%c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
ret <4 x float> %c
@@ -761,17 +935,33 @@ define <8 x float> @uitof32_256(<8 x i32
}
define <4 x float> @uitof32_128(<4 x i32> %a) nounwind {
-; NOVL-LABEL: uitof32_128:
-; NOVL: ## BB#0:
-; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; NOVL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; NOVL-NEXT: retq
+; KNL-LABEL: uitof32_128:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
;
; VL-LABEL: uitof32_128:
; VL: ## BB#0:
; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; VL-NEXT: retq
+;
+; AVX512DQ-LABEL: uitof32_128:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: uitof32_128:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x float>
ret <4 x float> %b
}
Modified: llvm/trunk/test/CodeGen/X86/avx512-ext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-ext.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-ext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-ext.ll Fri Mar 3 03:03:24 2017
@@ -1336,12 +1336,20 @@ define i16 @trunc_16i8_to_16i1(<16 x i8>
}
define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
-; ALL-LABEL: trunc_16i32_to_16i1:
-; ALL: ## BB#0:
-; ALL-NEXT: vpslld $31, %zmm0, %zmm0
-; ALL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; ALL-NEXT: kmovw %k0, %eax
-; ALL-NEXT: retq
+; KNL-LABEL: trunc_16i32_to_16i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: trunc_16i32_to_16i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %zmm0, %zmm0
+; SKX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%mask_b = trunc <16 x i32>%a to <16 x i1>
%mask = bitcast <16 x i1> %mask_b to i16
ret i16 %mask
@@ -1441,6 +1449,7 @@ define <8 x i16> @sext_8i1_8i16(<8 x i32
; SKX: ## BB#0:
; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = icmp slt <8 x i32> %a1, %a2
%y = sext <8 x i1> %x to <8 x i16>
@@ -1482,11 +1491,18 @@ define <8 x i64> @sext_8i1_8i64(<8 x i32
}
define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) {
-; ALL-LABEL: extload_v8i64:
-; ALL: ## BB#0:
-; ALL-NEXT: vpmovsxbq (%rdi), %zmm0
-; ALL-NEXT: vmovdqa64 %zmm0, (%rsi)
-; ALL-NEXT: retq
+; KNL-LABEL: extload_v8i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbq (%rdi), %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, (%rsi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: extload_v8i64:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmovsxbq (%rdi), %zmm0
+; SKX-NEXT: vmovdqa64 %zmm0, (%rsi)
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%sign_load = load <8 x i8>, <8 x i8>* %a
%c = sext <8 x i8> %sign_load to <8 x i64>
store <8 x i64> %c, <8 x i64>* %res
Modified: llvm/trunk/test/CodeGen/X86/avx512-extract-subvector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-extract-subvector.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-extract-subvector.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-extract-subvector.ll Fri Mar 3 03:03:24 2017
@@ -6,6 +6,7 @@ define <8 x i16> @extract_subvector128_v
; SKX-LABEL: extract_subvector128_v32i16:
; SKX: ## BB#0:
; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
ret <8 x i16> %r1
@@ -15,6 +16,7 @@ define <8 x i16> @extract_subvector128_v
; SKX-LABEL: extract_subvector128_v32i16_first_element:
; SKX: ## BB#0:
; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %r1
@@ -24,6 +26,7 @@ define <16 x i8> @extract_subvector128_v
; SKX-LABEL: extract_subvector128_v64i8:
; SKX: ## BB#0:
; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
ret <16 x i8> %r1
@@ -33,6 +36,7 @@ define <16 x i8> @extract_subvector128_v
; SKX-LABEL: extract_subvector128_v64i8_first_element:
; SKX: ## BB#0:
; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %r1
@@ -61,6 +65,7 @@ define void @extract_subvector256_v8f64_
; SKX-LABEL: extract_subvector256_v8f64_store:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3>
@@ -73,6 +78,7 @@ define void @extract_subvector256_v8f32_
; SKX-LABEL: extract_subvector256_v8f32_store:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -85,6 +91,7 @@ define void @extract_subvector256_v4i64_
; SKX-LABEL: extract_subvector256_v4i64_store:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
@@ -97,6 +104,7 @@ define void @extract_subvector256_v8i32_
; SKX-LABEL: extract_subvector256_v8i32_store:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -109,6 +117,7 @@ define void @extract_subvector256_v16i16
; SKX-LABEL: extract_subvector256_v16i16_store:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -121,6 +130,7 @@ define void @extract_subvector256_v32i8_
; SKX-LABEL: extract_subvector256_v32i8_store:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -133,6 +143,7 @@ define void @extract_subvector256_v4f64_
; SKX-LABEL: extract_subvector256_v4f64_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -145,6 +156,7 @@ define void @extract_subvector256_v4f64_
; SKX-LABEL: extract_subvector256_v4f64_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -157,6 +169,7 @@ define void @extract_subvector256_v4f32_
; SKX-LABEL: extract_subvector256_v4f32_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -169,6 +182,7 @@ define void @extract_subvector256_v4f32_
; SKX-LABEL: extract_subvector256_v4f32_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -181,6 +195,7 @@ define void @extract_subvector256_v2i64_
; SKX-LABEL: extract_subvector256_v2i64_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -193,6 +208,7 @@ define void @extract_subvector256_v2i64_
; SKX-LABEL: extract_subvector256_v2i64_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -205,6 +221,7 @@ define void @extract_subvector256_v4i32_
; SKX-LABEL: extract_subvector256_v4i32_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -217,6 +234,7 @@ define void @extract_subvector256_v4i32_
; SKX-LABEL: extract_subvector256_v4i32_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -229,6 +247,7 @@ define void @extract_subvector256_v8i16_
; SKX-LABEL: extract_subvector256_v8i16_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -241,6 +260,7 @@ define void @extract_subvector256_v8i16_
; SKX-LABEL: extract_subvector256_v8i16_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -253,6 +273,7 @@ define void @extract_subvector256_v16i8_
; SKX-LABEL: extract_subvector256_v16i8_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -265,6 +286,7 @@ define void @extract_subvector256_v16i8_
; SKX-LABEL: extract_subvector256_v16i8_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -277,6 +299,7 @@ define void @extract_subvector512_v2f64_
; SKX-LABEL: extract_subvector512_v2f64_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -289,6 +312,7 @@ define void @extract_subvector512_v2f64_
; SKX-LABEL: extract_subvector512_v2f64_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -301,6 +325,7 @@ define void @extract_subvector512_v4f32_
; SKX-LABEL: extract_subvector512_v4f32_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -313,6 +338,7 @@ define void @extract_subvector512_v4f32_
; SKX-LABEL: extract_subvector512_v4f32_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -325,6 +351,7 @@ define void @extract_subvector512_v2i64_
; SKX-LABEL: extract_subvector512_v2i64_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -337,6 +364,7 @@ define void @extract_subvector512_v2i64_
; SKX-LABEL: extract_subvector512_v2i64_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -349,6 +377,7 @@ define void @extract_subvector512_v4i32_
; SKX-LABEL: extract_subvector512_v4i32_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -361,6 +390,7 @@ define void @extract_subvector512_v4i32_
; SKX-LABEL: extract_subvector512_v4i32_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -373,6 +403,7 @@ define void @extract_subvector512_v8i16_
; SKX-LABEL: extract_subvector512_v8i16_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i16> %a, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -385,6 +416,7 @@ define void @extract_subvector512_v16i8_
; SKX-LABEL: extract_subvector512_v16i8_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <64 x i8> %a, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -397,6 +429,7 @@ define void @extract_subvector512_v16i8_
; SKX-LABEL: extract_subvector512_v16i8_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <64 x i8> %a, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -409,6 +442,7 @@ define void @extract_subvector512_v4f64_
; SKX-LABEL: extract_subvector512_v4f64_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -421,6 +455,7 @@ define void @extract_subvector512_v4f64_
; SKX-LABEL: extract_subvector512_v4f64_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -433,6 +468,7 @@ define void @extract_subvector512_v4f64_
; SKX-LABEL: extract_subvector512_v4f64_store_lo_align_32:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -445,6 +481,7 @@ define void @extract_subvector512_v8f32_
; SKX-LABEL: extract_subvector512_v8f32_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -457,6 +494,7 @@ define void @extract_subvector512_v8f32_
; SKX-LABEL: extract_subvector512_v8f32_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -469,6 +507,7 @@ define void @extract_subvector512_v8f32_
; SKX-LABEL: extract_subvector512_v8f32_store_lo_align_32:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -481,6 +520,7 @@ define void @extract_subvector512_v4i64_
; SKX-LABEL: extract_subvector512_v4i64_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -493,6 +533,7 @@ define void @extract_subvector512_v4i64_
; SKX-LABEL: extract_subvector512_v4i64_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -505,6 +546,7 @@ define void @extract_subvector512_v4i64_
; SKX-LABEL: extract_subvector512_v4i64_store_lo_align_32:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -517,6 +559,7 @@ define void @extract_subvector512_v8i32_
; SKX-LABEL: extract_subvector512_v8i32_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -529,6 +572,7 @@ define void @extract_subvector512_v8i32_
; SKX-LABEL: extract_subvector512_v8i32_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -541,6 +585,7 @@ define void @extract_subvector512_v8i32_
; SKX-LABEL: extract_subvector512_v8i32_store_lo_align_32:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -553,6 +598,7 @@ define void @extract_subvector512_v16i16
; SKX-LABEL: extract_subvector512_v16i16_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -565,6 +611,7 @@ define void @extract_subvector512_v16i16
; SKX-LABEL: extract_subvector512_v16i16_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -577,6 +624,7 @@ define void @extract_subvector512_v16i16
; SKX-LABEL: extract_subvector512_v16i16_store_lo_align_32:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -589,6 +637,7 @@ define void @extract_subvector512_v32i8_
; SKX-LABEL: extract_subvector512_v32i8_store_lo:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <64 x i8> %a, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -601,6 +650,7 @@ define void @extract_subvector512_v32i8_
; SKX-LABEL: extract_subvector512_v32i8_store_lo_align_16:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <64 x i8> %a, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -613,6 +663,7 @@ define void @extract_subvector512_v32i8_
; SKX-LABEL: extract_subvector512_v32i8_store_lo_align_32:
; SKX: ## BB#0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = shufflevector <64 x i8> %a, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -654,6 +705,7 @@ define <4 x float> @test_mm512_mask_extr
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf32x4 $1, %zmm1, %xmm0 {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = bitcast <8 x double> %__A to <16 x float>
@@ -669,6 +721,7 @@ define <4 x float> @test_mm512_maskz_ext
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf32x4 $1, %zmm0, %xmm0 {%k1} {z}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = bitcast <8 x double> %__A to <16 x float>
@@ -684,6 +737,7 @@ define <2 x double> @test_mm256_mask_ext
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf64x2 $1, %ymm1, %xmm0 {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <4 x double> %__A, <4 x double> undef, <2 x i32> <i32 2, i32 3>
@@ -698,6 +752,7 @@ define <2 x double> @test_mm256_maskz_ex
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <4 x double> %__A, <4 x double> undef, <2 x i32> <i32 2, i32 3>
@@ -712,6 +767,7 @@ define <2 x i64> @test_mm256_mask_extrac
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextracti64x2 $1, %ymm1, %xmm0 {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <4 x i64> %__A, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
@@ -726,6 +782,7 @@ define <2 x i64> @test_mm256_maskz_extra
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm0 {%k1} {z}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <4 x i64> %__A, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
@@ -740,6 +797,7 @@ define <4 x float> @test_mm256_mask_extr
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf32x4 $1, %ymm1, %xmm0 {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %__A, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -754,6 +812,7 @@ define <4 x float> @test_mm256_maskz_ext
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %__A, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -768,6 +827,7 @@ define <2 x i64> @test_mm256_mask_extrac
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextracti32x4 $1, %ymm1, %xmm0 {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
@@ -785,6 +845,7 @@ define <2 x i64> @test_mm256_maskz_extra
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0 {%k1} {z}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
@@ -827,6 +888,7 @@ define <2 x double> @test_mm512_mask_ext
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf64x2 $3, %zmm1, %xmm0 {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <8 x double> %__A, <8 x double> undef, <2 x i32> <i32 6, i32 7>
@@ -841,6 +903,7 @@ define <2 x double> @test_mm512_maskz_ex
; SKX: ## BB#0: ## %entry
; SKX-NEXT: kmovb %edi, %k1
; SKX-NEXT: vextractf64x2 $3, %zmm0, %xmm0 {%k1} {z}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
%shuffle = shufflevector <8 x double> %__A, <8 x double> undef, <2 x i32> <i32 6, i32 7>
Modified: llvm/trunk/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-gather-scatter-intrin.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-gather-scatter-intrin.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-gather-scatter-intrin.ll Fri Mar 3 03:03:24 2017
@@ -19,6 +19,7 @@ define void @gather_mask_dps(<16 x i32>
; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -34,6 +35,7 @@ define void @gather_mask_dpd(<8 x i32> %
; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -49,6 +51,7 @@ define void @gather_mask_qps(<8 x i64> %
; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
@@ -64,6 +67,7 @@ define void @gather_mask_qpd(<8 x i64> %
; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
@@ -91,6 +95,7 @@ define void @gather_mask_dd(<16 x i32> %
; CHECK-NEXT: vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -106,6 +111,7 @@ define void @gather_mask_qd(<8 x i64> %i
; CHECK-NEXT: vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
@@ -121,6 +127,7 @@ define void @gather_mask_qq(<8 x i64> %i
; CHECK-NEXT: vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
@@ -136,6 +143,7 @@ define void @gather_mask_dq(<8 x i32> %i
; CHECK-NEXT: vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -149,6 +157,7 @@ define void @gather_mask_dpd_execdomain(
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm1, (%rdx)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
store <8 x double> %x, <8 x double>* %stbuf
@@ -161,6 +170,7 @@ define void @gather_mask_qpd_execdomain(
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm1, (%rdx)
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
store <8 x double> %x, <8 x double>* %stbuf
@@ -195,6 +205,7 @@ define void @scatter_mask_dpd_execdomain
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = load <8 x double>, <8 x double>* %src, align 64
call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
@@ -207,6 +218,7 @@ define void @scatter_mask_qpd_execdomain
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = load <8 x double>, <8 x double>* %src, align 64
call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
@@ -219,6 +231,7 @@ define void @scatter_mask_dps_execdomain
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = load <16 x float>, <16 x float>* %src, align 64
call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
@@ -231,6 +244,7 @@ define void @scatter_mask_qps_execdomain
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = load <8 x float>, <8 x float>* %src, align 32
call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
@@ -245,6 +259,7 @@ define void @gather_qps(<8 x i64> %ind,
; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
@@ -267,6 +282,7 @@ define void @prefetch(<8 x i64> %ind, i8
; CHECK-NEXT: movb $120, %al
; CHECK-NEXT: kmovb %eax, %k1
; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0)
call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 1)
@@ -300,7 +316,7 @@ define <2 x i64>@test_int_x86_avx512_gat
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
-; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
%res1 = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
@@ -336,7 +352,7 @@ define <4 x i64>@test_int_x86_avx512_gat
; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; CHECK-NEXT: retq
%res = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8)
%res1 = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8)
@@ -391,6 +407,7 @@ define <4 x float>@test_int_x86_avx512_g
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherqps (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
@@ -409,6 +426,7 @@ define <4 x i32>@test_int_x86_avx512_gat
; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
%res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2)
@@ -474,7 +492,7 @@ define <4 x i64>@test_int_x86_avx512_gat
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
-; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; CHECK-NEXT: retq
%res = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
%res1 = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
@@ -593,6 +611,7 @@ define void@test_int_x86_avx512_scatterd
; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2)
call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4)
@@ -608,6 +627,7 @@ define void@test_int_x86_avx512_scatterd
; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2)
call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4)
@@ -653,6 +673,7 @@ define void@test_int_x86_avx512_scatterd
; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2)
call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4)
@@ -668,6 +689,7 @@ define void@test_int_x86_avx512_scatterd
; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2)
call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4)
@@ -713,6 +735,7 @@ define void@test_int_x86_avx512_scatters
; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2)
call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4)
@@ -728,6 +751,7 @@ define void@test_int_x86_avx512_scatters
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2)
call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4)
@@ -773,6 +797,7 @@ define void@test_int_x86_avx512_scatters
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2)
call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4)
@@ -788,6 +813,7 @@ define void@test_int_x86_avx512_scatters
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4)
@@ -807,6 +833,7 @@ define void @scatter_mask_test(i8* %x0,
; CHECK-NEXT: movb $96, %al
; CHECK-NEXT: kmovb %eax, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4)
Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Fri Mar 3 03:03:24 2017
@@ -148,6 +148,7 @@ define float @test7(<16 x float> %x, i32
; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%e = extractelement <16 x float> %x, i32 %ind
ret float %e
@@ -180,6 +181,7 @@ define double @test8(<8 x double> %x, i3
; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%e = extractelement <8 x double> %x, i32 %ind
ret double %e
@@ -212,6 +214,7 @@ define float @test9(<8 x float> %x, i32
; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%e = extractelement <8 x float> %x, i32 %ind
ret float %e
@@ -244,6 +247,7 @@ define i32 @test10(<16 x i32> %x, i32 %i
; SKX-NEXT: movl (%rsp,%rdi,4), %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%e = extractelement <16 x i32> %x, i32 %ind
ret i32 %e
@@ -318,6 +322,7 @@ define i64 @test12(<16 x i64>%a, <16 x i
; SKX-NEXT: testb %al, %al
; SKX-NEXT: cmoveq %rsi, %rdi
; SKX-NEXT: movq %rdi, %rax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%cmpvector_func.i = icmp slt <16 x i64> %a, %b
%extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0
@@ -382,6 +387,7 @@ define i64 @test14(<8 x i64>%a, <8 x i64
; SKX-NEXT: testb %al, %al
; SKX-NEXT: cmoveq %rsi, %rdi
; SKX-NEXT: movq %rdi, %rax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%cmpvector_func.i = icmp slt <8 x i64> %a, %b
%extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4
@@ -441,6 +447,7 @@ define i16 @test16(i1 *%addr, i16 %a) {
; SKX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovd2m %zmm2, %k0
; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
%a1 = bitcast i16 %a to <16 x i1>
@@ -477,6 +484,7 @@ define i8 @test17(i1 *%addr, i8 %a) {
; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovq2m %zmm2, %k0
; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
%a1 = bitcast i8 %a to <8 x i1>
@@ -498,6 +506,7 @@ define i64 @extract_v8i64(<8 x i64> %x,
; SKX-NEXT: vpextrq $1, %xmm0, %rax
; SKX-NEXT: vextracti64x2 $1, %zmm0, %xmm0
; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <8 x i64> %x, i32 1
%r2 = extractelement <8 x i64> %x, i32 3
@@ -518,6 +527,7 @@ define i64 @extract_v4i64(<4 x i64> %x,
; SKX-NEXT: vpextrq $1, %xmm0, %rax
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <4 x i64> %x, i32 1
%r2 = extractelement <4 x i64> %x, i32 3
@@ -556,6 +566,7 @@ define i32 @extract_v16i32(<16 x i32> %x
; SKX-NEXT: vpextrd $1, %xmm0, %eax
; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; SKX-NEXT: vpextrd $1, %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <16 x i32> %x, i32 1
%r2 = extractelement <16 x i32> %x, i32 5
@@ -576,6 +587,7 @@ define i32 @extract_v8i32(<8 x i32> %x,
; SKX-NEXT: vpextrd $1, %xmm0, %eax
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrd $1, %xmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <8 x i32> %x, i32 1
%r2 = extractelement <8 x i32> %x, i32 5
@@ -616,6 +628,7 @@ define i16 @extract_v32i16(<32 x i16> %x
; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <32 x i16> %x, i32 1
%r2 = extractelement <32 x i16> %x, i32 9
@@ -638,6 +651,7 @@ define i16 @extract_v16i16(<16 x i16> %x
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <16 x i16> %x, i32 1
%r2 = extractelement <16 x i16> %x, i32 9
@@ -680,6 +694,7 @@ define i8 @extract_v64i8(<64 x i8> %x, i
; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <64 x i8> %x, i32 1
%r2 = extractelement <64 x i8> %x, i32 17
@@ -702,6 +717,7 @@ define i8 @extract_v32i8(<32 x i8> %x, i
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = extractelement <32 x i8> %x, i32 1
%r2 = extractelement <32 x i8> %x, i32 17
@@ -1272,6 +1288,7 @@ define i32 @test_insertelement_v32i1(i32
; SKX-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
; SKX-NEXT: vpmovw2m %zmm2, %k0
; SKX-NEXT: kmovd %k0, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%cmp_res_i1 = icmp ult i32 %a, %b
%cmp_cmp_vec = icmp ult <32 x i32> %x, %y
@@ -1493,6 +1510,7 @@ define zeroext i8 @test_extractelement_v
; SKX-NEXT: kshiftrd $31, %k0, %k0
; SKX-NEXT: kmovw %k0, %eax
; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <32 x i8> %a, %b
%t2 = extractelement <32 x i1> %t1, i32 2
@@ -1525,6 +1543,7 @@ define zeroext i8 @test_extractelement_v
; SKX-NEXT: sete %al
; SKX-NEXT: addb $3, %al
; SKX-NEXT: movzbl %al, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <64 x i8> %a, %b
%t2 = extractelement <64 x i1> %t1, i32 63
@@ -1555,6 +1574,7 @@ define zeroext i8 @extractelement_v64i1_
; SKX-NEXT: sete %al
; SKX-NEXT: addb $3, %al
; SKX-NEXT: movzbl %al, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <64 x i8> %a, %b
%t2 = extractelement <64 x i1> %t1, i32 63
@@ -1622,6 +1642,7 @@ define i64 @test_extractelement_variable
; SKX-NEXT: movq (%rsp,%rdi,8), %rax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <4 x i64> %t1, i32 %index
ret i64 %t2
@@ -1666,6 +1687,7 @@ define i64 @test_extractelement_variable
; SKX-NEXT: movq (%rsp,%rdi,8), %rax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <8 x i64> %t1, i32 %index
ret i64 %t2
@@ -1730,6 +1752,7 @@ define double @test_extractelement_varia
; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <4 x double> %t1, i32 %index
ret double %t2
@@ -1774,6 +1797,7 @@ define double @test_extractelement_varia
; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <8 x double> %t1, i32 %index
ret double %t2
@@ -1838,6 +1862,7 @@ define i32 @test_extractelement_variable
; SKX-NEXT: movl (%rsp,%rdi,4), %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <8 x i32> %t1, i32 %index
ret i32 %t2
@@ -1882,6 +1907,7 @@ define i32 @test_extractelement_variable
; SKX-NEXT: movl (%rsp,%rdi,4), %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <16 x i32> %t1, i32 %index
ret i32 %t2
@@ -1946,6 +1972,7 @@ define float @test_extractelement_variab
; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <8 x float> %t1, i32 %index
ret float %t2
@@ -1990,6 +2017,7 @@ define float @test_extractelement_variab
; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <16 x float> %t1, i32 %index
ret float %t2
@@ -2054,6 +2082,7 @@ define i16 @test_extractelement_variable
; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <16 x i16> %t1, i32 %index
ret i16 %t2
@@ -2099,6 +2128,7 @@ define i16 @test_extractelement_variable
; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <32 x i16> %t1, i32 %index
ret i16 %t2
@@ -2167,6 +2197,7 @@ define i8 @test_extractelement_variable_
; SKX-NEXT: movb (%rdi,%rax), %al
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <32 x i8> %t1, i32 %index
@@ -2215,6 +2246,7 @@ define i8 @test_extractelement_variable_
; SKX-NEXT: movb (%rdi,%rax), %al
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t2 = extractelement <64 x i8> %t1, i32 %index
@@ -2265,6 +2297,7 @@ define i8 @test_extractelement_variable_
; SKX-NEXT: movb (%rax,%rcx), %al
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%i = add i8 %index, %index
@@ -2379,6 +2412,7 @@ define zeroext i8 @test_extractelement_v
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <8 x i32> %a, %b
%t2 = extractelement <8 x i1> %t1, i32 %index
@@ -2431,6 +2465,7 @@ define zeroext i8 @test_extractelement_v
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <16 x i32> %a, %b
%t2 = extractelement <16 x i1> %t1, i32 %index
@@ -2487,6 +2522,7 @@ define zeroext i8 @test_extractelement_v
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <32 x i8> %a, %b
%t2 = extractelement <32 x i1> %t1, i32 %index
Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract_i1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract_i1.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract_i1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract_i1.ll Fri Mar 3 03:03:24 2017
@@ -27,6 +27,7 @@ define zeroext i8 @test_extractelement_v
; SKX-NEXT: movzbl %al, %eax
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%t1 = icmp ugt <64 x i8> %a, %b
%t2 = extractelement <64 x i1> %t1, i32 %index
Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Fri Mar 3 03:03:24 2017
@@ -168,14 +168,24 @@ define i8 @shuf_test1(i16 %v) nounwind {
}
define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: zext_test1:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kshiftlw $10, %k0, %k0
-; CHECK-NEXT: kshiftrw $15, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retq
+; KNL-LABEL: zext_test1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; KNL-NEXT: kshiftlw $10, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_test1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; SKX-NEXT: kshiftlw $10, %k0, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i32
@@ -183,15 +193,26 @@ define i32 @zext_test1(<16 x i32> %a, <1
}
define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: zext_test2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kshiftlw $10, %k0, %k0
-; CHECK-NEXT: kshiftrw $15, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; CHECK-NEXT: retq
+; KNL-LABEL: zext_test2:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; KNL-NEXT: kshiftlw $10, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_test2:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; SKX-NEXT: kshiftlw $10, %k0, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i16
@@ -199,15 +220,26 @@ define i16 @zext_test2(<16 x i32> %a, <1
}
define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: zext_test3:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kshiftlw $10, %k0, %k0
-; CHECK-NEXT: kshiftrw $15, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq
+; KNL-LABEL: zext_test3:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; KNL-NEXT: kshiftlw $10, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_test3:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; SKX-NEXT: kshiftlw $10, %k0, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i8
@@ -257,6 +289,7 @@ define <4 x i32> @test4(<4 x i64> %x, <4
; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k1
; SKX-NEXT: kandnw %k0, %k1, %k0
; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x_gt_y = icmp sgt <4 x i64> %x, %y
%x1_gt_y1 = icmp sgt <4 x i64> %x1, %y1
@@ -356,10 +389,12 @@ define <16 x i8> @test8(<16 x i32>%a, <1
; SKX-NEXT: ## BB#2:
; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB17_1:
; SKX-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%cond = icmp sgt i32 %a1, %b1
%cmp1 = icmp sgt <16 x i32> %a, zeroinitializer
@@ -660,6 +695,7 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; SKX-NEXT: kshiftlb $7, %k2, %k1
; SKX-NEXT: korb %k1, %k0, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%b1 = bitcast i16 %y to <16 x i1>
@@ -983,9 +1019,11 @@ define void @ktest_1(<8 x double> %in, d
; SKX-NEXT: je LBB41_2
; SKX-NEXT: ## BB#1: ## %L1
; SKX-NEXT: vmovapd %zmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB41_2: ## %L2
; SKX-NEXT: vmovapd %zmm0, 8(%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%addr1 = getelementptr double, double * %base, i64 0
%addr2 = getelementptr double, double * %base, i64 1
@@ -1332,10 +1370,12 @@ define void @ktest_2(<32 x float> %in, f
; SKX-NEXT: ## BB#1: ## %L1
; SKX-NEXT: vmovaps %zmm0, (%rdi)
; SKX-NEXT: vmovaps %zmm1, 64(%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
; SKX-NEXT: LBB42_2: ## %L2
; SKX-NEXT: vmovaps %zmm0, 4(%rdi)
; SKX-NEXT: vmovaps %zmm1, 68(%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%addr1 = getelementptr float, float * %base, i64 0
%addr2 = getelementptr float, float * %base, i64 1
@@ -1567,6 +1607,7 @@ define void @store_32i1(<32 x i1>* %a, <
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k0
; SKX-NEXT: kmovd %k0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
store <32 x i1> %v, <32 x i1>* %a
ret void
@@ -1594,6 +1635,7 @@ define void @store_32i1_1(<32 x i1>* %a,
; SKX-NEXT: vpsllw $15, %zmm0, %zmm0
; SKX-NEXT: vpmovw2m %zmm0, %k0
; SKX-NEXT: kmovd %k0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%v1 = trunc <32 x i16> %v to <32 x i1>
store <32 x i1> %v1, <32 x i1>* %a
@@ -1928,6 +1970,7 @@ define void @store_64i1(<64 x i1>* %a, <
; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: kmovq %k0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
store <64 x i1> %v, <64 x i1>* %a
ret void
@@ -1949,6 +1992,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x
; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; SKX-NEXT: kmovb %k0, %eax
; SKX-NEXT: addl %eax, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%v1 = icmp eq <16 x i32> %a, zeroinitializer
%mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1959,13 +2003,22 @@ define i32 @test_bitcast_v8i1_zext(<16 x
}
define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
-; CHECK-LABEL: test_bitcast_v16i1_zext:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: addl %eax, %eax
-; CHECK-NEXT: retq
+; KNL-LABEL: test_bitcast_v16i1_zext:
+; KNL: ## BB#0:
+; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: addl %eax, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_bitcast_v16i1_zext:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: addl %eax, %eax
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%v1 = icmp eq <16 x i32> %a, zeroinitializer
%mask1 = bitcast <16 x i1> %v1 to i16
%val = zext i16 %mask1 to i32
Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-spills.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-spills.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-spills.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-spills.ll Fri Mar 3 03:03:24 2017
@@ -37,6 +37,7 @@ define <8 x i1> @test_8i1(<8 x i32> %a,
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
@@ -62,6 +63,7 @@ define <16 x i1> @test_16i1(<16 x i32> %
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
@@ -86,6 +88,7 @@ define <32 x i1> @test_32i1(<32 x i16> %
; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill
; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload
; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload
@@ -110,6 +113,7 @@ define <64 x i1> @test_64i1(<64 x i8> %a
; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq _f
; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
Modified: llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-masked-memop-64-32.ll Fri Mar 3 03:03:24 2017
@@ -32,6 +32,7 @@ define void @test3(<16 x i32> %trigger,
; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
@@ -56,6 +57,7 @@ define void @test13(<16 x i32> %trigger,
; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512-NEXT: vmovups %zmm1, (%rdi) {%k1}
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v16f32.p0v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask)
@@ -67,6 +69,7 @@ define void @one_mask_bit_set5(<8 x doub
; AVX512: ## BB#0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmovlps %xmm0, 48(%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
ret void
@@ -143,6 +146,7 @@ define void @test_store_16i64(<16 x i64>
; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: kshiftrw $8, %k1, %k1
; AVX512F-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_store_16i64:
@@ -152,6 +156,7 @@ define void @test_store_16i64(<16 x i64>
; SKX-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
call void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
ret void
@@ -167,6 +172,7 @@ define void @test_store_16f64(<16 x doub
; AVX512F-NEXT: vmovupd %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: kshiftrw $8, %k1, %k1
; AVX512F-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_store_16f64:
@@ -176,6 +182,7 @@ define void @test_store_16f64(<16 x doub
; SKX-NEXT: vmovupd %zmm1, (%rdi) {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
call void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
ret void
Modified: llvm/trunk/test/CodeGen/X86/avx512-masked_memop-16-8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-masked_memop-16-8.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-masked_memop-16-8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-masked_memop-16-8.ll Fri Mar 3 03:03:24 2017
@@ -93,6 +93,7 @@ define void @test_mask_store_32xi8(<32 x
; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
; CHECK-NEXT: vpmovb2m %ymm0, %k1
; CHECK-NEXT: vmovdqu8 %ymm1, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask)
ret void
@@ -105,6 +106,7 @@ define void @test_mask_store_64xi8(<64 x
; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0
; CHECK-NEXT: vpmovb2m %zmm0, %k1
; CHECK-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> %val, <64 x i8>* %addr, i32 4, <64 x i1>%mask)
ret void
@@ -129,6 +131,7 @@ define void @test_mask_store_16xi16(<16
; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT: vpmovb2m %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask)
ret void
@@ -141,6 +144,7 @@ define void @test_mask_store_32xi16(<32
; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
; CHECK-NEXT: vpmovb2m %ymm0, %k1
; CHECK-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> %val, <32 x i16>* %addr, i32 4, <32 x i1>%mask)
ret void
Modified: llvm/trunk/test/CodeGen/X86/avx512-skx-insert-subvec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-skx-insert-subvec.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-skx-insert-subvec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-skx-insert-subvec.ll Fri Mar 3 03:03:24 2017
@@ -35,6 +35,7 @@ define <8 x i1> @test2(<2 x i1> %a) {
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; CHECK-NEXT: vpmovq2m %zmm0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <2 x i1> %a, <2 x i1> zeroinitializer, <8 x i32> <i32 3, i32 3, i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef>
ret <8 x i1> %res
Modified: llvm/trunk/test/CodeGen/X86/avx512-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-trunc.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-trunc.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-trunc.ll Fri Mar 3 03:03:24 2017
@@ -8,6 +8,7 @@ define <16 x i8> @trunc_16x32_to_16x8(<1
; ALL-LABEL: trunc_16x32_to_16x8:
; ALL: ## BB#0:
; ALL-NEXT: vpmovdb %zmm0, %xmm0
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <16 x i32> %i to <16 x i8>
ret <16 x i8> %x
@@ -17,6 +18,7 @@ define <8 x i16> @trunc_8x64_to_8x16(<8
; ALL-LABEL: trunc_8x64_to_8x16:
; ALL: ## BB#0:
; ALL-NEXT: vpmovqw %zmm0, %xmm0
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <8 x i64> %i to <8 x i16>
ret <8 x i16> %x
@@ -35,6 +37,7 @@ define <8 x i8> @trunc_qb_512(<8 x i64>
; ALL-LABEL: trunc_qb_512:
; ALL: ## BB#0:
; ALL-NEXT: vpmovqw %zmm0, %xmm0
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <8 x i64> %i to <8 x i8>
ret <8 x i8> %x
@@ -44,6 +47,7 @@ define void @trunc_qb_512_mem(<8 x i64>
; ALL-LABEL: trunc_qb_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovqb %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <8 x i64> %i to <8 x i8>
store <8 x i8> %x, <8 x i8>* %res
@@ -56,11 +60,13 @@ define <4 x i8> @trunc_qb_256(<4 x i64>
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qb_256:
; SKX: ## BB#0:
; SKX-NEXT: vpmovqd %ymm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <4 x i64> %i to <4 x i8>
ret <4 x i8> %x
@@ -73,11 +79,13 @@ define void @trunc_qb_256_mem(<4 x i64>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; KNL-NEXT: vmovd %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qb_256_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovqb %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <4 x i64> %i to <4 x i8>
store <4 x i8> %x, <4 x i8>* %res
@@ -112,6 +120,7 @@ define <8 x i16> @trunc_qw_512(<8 x i64>
; ALL-LABEL: trunc_qw_512:
; ALL: ## BB#0:
; ALL-NEXT: vpmovqw %zmm0, %xmm0
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <8 x i64> %i to <8 x i16>
ret <8 x i16> %x
@@ -121,6 +130,7 @@ define void @trunc_qw_512_mem(<8 x i64>
; ALL-LABEL: trunc_qw_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovqw %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <8 x i64> %i to <8 x i16>
store <8 x i16> %x, <8 x i16>* %res
@@ -133,11 +143,13 @@ define <4 x i16> @trunc_qw_256(<4 x i64>
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qw_256:
; SKX: ## BB#0:
; SKX-NEXT: vpmovqd %ymm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <4 x i64> %i to <4 x i16>
ret <4 x i16> %x
@@ -150,11 +162,13 @@ define void @trunc_qw_256_mem(<4 x i64>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; KNL-NEXT: vmovq %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qw_256_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovqw %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <4 x i64> %i to <4 x i16>
store <4 x i16> %x, <4 x i16>* %res
@@ -199,6 +213,7 @@ define void @trunc_qd_512_mem(<8 x i64>
; ALL-LABEL: trunc_qd_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovqd %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <8 x i64> %i to <8 x i32>
store <8 x i32> %x, <8 x i32>* %res
@@ -211,11 +226,13 @@ define <4 x i32> @trunc_qd_256(<4 x i64>
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qd_256:
; SKX: ## BB#0:
; SKX-NEXT: vpmovqd %ymm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <4 x i64> %i to <4 x i32>
ret <4 x i32> %x
@@ -227,11 +244,13 @@ define void @trunc_qd_256_mem(<4 x i64>
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vmovdqa %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qd_256_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovqd %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <4 x i64> %i to <4 x i32>
store <4 x i32> %x, <4 x i32>* %res
@@ -266,6 +285,7 @@ define <16 x i8> @trunc_db_512(<16 x i32
; ALL-LABEL: trunc_db_512:
; ALL: ## BB#0:
; ALL-NEXT: vpmovdb %zmm0, %xmm0
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <16 x i32> %i to <16 x i8>
ret <16 x i8> %x
@@ -275,6 +295,7 @@ define void @trunc_db_512_mem(<16 x i32>
; ALL-LABEL: trunc_db_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovdb %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <16 x i32> %i to <16 x i8>
store <16 x i8> %x, <16 x i8>* %res
@@ -287,11 +308,13 @@ define <8 x i8> @trunc_db_256(<8 x i32>
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_db_256:
; SKX: ## BB#0:
; SKX-NEXT: vpmovdw %ymm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <8 x i32> %i to <8 x i8>
ret <8 x i8> %x
@@ -304,11 +327,13 @@ define void @trunc_db_256_mem(<8 x i32>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; KNL-NEXT: vmovq %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_db_256_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovdb %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <8 x i32> %i to <8 x i8>
store <8 x i8> %x, <8 x i8>* %res
@@ -352,6 +377,7 @@ define void @trunc_dw_512_mem(<16 x i32>
; ALL-LABEL: trunc_dw_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovdw %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x = trunc <16 x i32> %i to <16 x i16>
store <16 x i16> %x, <16 x i16>* %res
@@ -364,11 +390,13 @@ define <8 x i16> @trunc_dw_256(<8 x i32>
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_dw_256:
; SKX: ## BB#0:
; SKX-NEXT: vpmovdw %ymm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <8 x i32> %i to <8 x i16>
ret <8 x i16> %x
@@ -380,11 +408,13 @@ define void @trunc_dw_256_mem(<8 x i32>
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vmovdqa %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_dw_256_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovdw %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <8 x i32> %i to <8 x i16>
store <8 x i16> %x, <8 x i16>* %res
@@ -434,11 +464,13 @@ define void @trunc_wb_512_mem(<32 x i16>
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vmovdqa %ymm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_wb_512_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovwb %zmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <32 x i16> %i to <32 x i8>
store <32 x i8> %x, <32 x i8>* %res
@@ -450,11 +482,13 @@ define <16 x i8> @trunc_wb_256(<16 x i16
; KNL: ## BB#0:
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_wb_256:
; SKX: ## BB#0:
; SKX-NEXT: vpmovwb %ymm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <16 x i16> %i to <16 x i8>
ret <16 x i8> %x
@@ -466,11 +500,13 @@ define void @trunc_wb_256_mem(<16 x i16>
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vmovdqa %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_wb_256_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovwb %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x = trunc <16 x i16> %i to <16 x i8>
store <16 x i8> %x, <16 x i8>* %res
@@ -509,11 +545,13 @@ define void @usat_trunc_wb_256_mem(<16 x
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vmovdqu %xmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_wb_256_mem:
; SKX: ## BB#0:
; SKX-NEXT: vpmovuswb %ymm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -528,11 +566,13 @@ define <16 x i8> @usat_trunc_wb_256(<16
; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_wb_256:
; SKX: ## BB#0:
; SKX-NEXT: vpmovuswb %ymm0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -563,6 +603,7 @@ define void @usat_trunc_db_512_mem(<16 x
; ALL-LABEL: usat_trunc_db_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovusdb %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x3 = icmp ult <16 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -575,6 +616,7 @@ define void @usat_trunc_qb_512_mem(<8 x
; ALL-LABEL: usat_trunc_qb_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovusqb %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x3 = icmp ult <8 x i64> %i, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
%x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
@@ -587,6 +629,7 @@ define void @usat_trunc_qd_512_mem(<8 x
; ALL-LABEL: usat_trunc_qd_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovusqd %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x3 = icmp ult <8 x i64> %i, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
@@ -599,6 +642,7 @@ define void @usat_trunc_qw_512_mem(<8 x
; ALL-LABEL: usat_trunc_qw_512_mem:
; ALL: ## BB#0:
; ALL-NEXT: vpmovusqw %zmm0, (%rdi)
+; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%x3 = icmp ult <8 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
%x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
@@ -638,6 +682,7 @@ define void @usat_trunc_db_1024_mem(<32
; KNL-NEXT: vpmovusdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vmovdqu %ymm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_db_1024_mem:
@@ -649,6 +694,7 @@ define void @usat_trunc_db_1024_mem(<32
; SKX-NEXT: vpmovdw %zmm1, %ymm1
; SKX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; SKX-NEXT: vpmovwb %zmm0, (%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -714,6 +760,7 @@ define <16 x i8> @usat_trunc_db_256(<8 x
; KNL-NEXT: vpminud %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_db_256:
@@ -721,6 +768,7 @@ define <16 x i8> @usat_trunc_db_256(<8 x
; SKX-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
; SKX-NEXT: vpmovdw %ymm0, %xmm0
; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%tmp1 = icmp ult <8 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
%tmp2 = select <8 x i1> %tmp1, <8 x i32> %x, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
Modified: llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll Fri Mar 3 03:03:24 2017
@@ -160,13 +160,22 @@ define <8 x i32> @test11_unsigned(<8 x i
}
define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
-; CHECK-LABEL: test12:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
-; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
-; CHECK-NEXT: kunpckbw %k0, %k1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; KNL-LABEL: test12:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; KNL-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; KNL-NEXT: kunpckbw %k0, %k1, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test12:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; SKX-NEXT: kunpckbw %k0, %k1, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
%res = icmp eq <16 x i64> %a, %b
%res1 = bitcast <16 x i1> %res to i16
ret i16 %res1
@@ -326,6 +335,7 @@ define i32 @test12_v32i32(<32 x i32> %a,
; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
; SKX-NEXT: kunpckwd %k0, %k1, %k0
; SKX-NEXT: kmovd %k0, %eax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%res = icmp eq <32 x i32> %a, %b
%res1 = bitcast <32 x i1> %res to i32
@@ -637,6 +647,7 @@ define i64 @test12_v64i16(<64 x i16> %a,
; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
; SKX-NEXT: kunpckdq %k0, %k1, %k0
; SKX-NEXT: kmovq %k0, %rax
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%res = icmp eq <64 x i16> %a, %b
%res1 = bitcast <64 x i1> %res to i64
@@ -892,6 +903,7 @@ define <16 x i8>@test29(<16 x i32> %x, <
; SKX-NEXT: vpcmpgtd %zmm3, %zmm2, %k1
; SKX-NEXT: kxorw %k1, %k0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x_gt_y = icmp sgt <16 x i32> %x, %y
%x1_gt_y1 = icmp sgt <16 x i32> %x1, %y1
Modified: llvm/trunk/test/CodeGen/X86/combine-testm-and.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-testm-and.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-testm-and.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-testm-and.ll Fri Mar 3 03:03:24 2017
@@ -6,6 +6,7 @@ define i32 @combineTESTM_AND_1(<8 x i64>
; CHECK: ## BB#0:
; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%and.i = and <8 x i64> %b, %a
%test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 -1)
@@ -19,6 +20,7 @@ define i32 @combineTESTM_AND_2(<8 x i64>
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%and.i = and <8 x i64> %b, %a
%test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 %mask)
@@ -32,6 +34,7 @@ define i32 @combineTESTM_AND_mask_3(<8 x
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vptestmq (%rdi), %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%b = load <8 x i64>, <8 x i64>* %bptr
%and.i = and <8 x i64> %a, %b
@@ -46,6 +49,7 @@ define i32 @combineTESTM_AND_mask_4(<8 x
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: vptestmq (%rdi), %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%b = load <8 x i64>, <8 x i64>* %bptr
%and.i = and <8 x i64> %b, %a
Modified: llvm/trunk/test/CodeGen/X86/compress_expand.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/compress_expand.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/compress_expand.ll (original)
+++ llvm/trunk/test/CodeGen/X86/compress_expand.ll Fri Mar 3 03:03:24 2017
@@ -94,12 +94,20 @@ declare <4 x float> @llvm.masked.expand
declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
define void @test6(float* %base, <16 x float> %V) {
-; ALL-LABEL: test6:
-; ALL: # BB#0:
-; ALL-NEXT: movw $-2049, %ax # imm = 0xF7FF
-; ALL-NEXT: kmovw %eax, %k1
-; ALL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
-; ALL-NEXT: retq
+; SKX-LABEL: test6:
+; SKX: # BB#0:
+; SKX-NEXT: movw $-2049, %ax # imm = 0xF7FF
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vcompressps %zmm0, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test6:
+; KNL: # BB#0:
+; KNL-NEXT: movw $-2049, %ax # imm = 0xF7FF
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
+; KNL-NEXT: retq
call void @llvm.masked.compressstore.v16f32(<16 x float> %V, float* %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true>)
ret void
}
@@ -110,6 +118,7 @@ define void @test7(float* %base, <8 x fl
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vcompressps %ymm0, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; KNL-LABEL: test7:
@@ -132,6 +141,7 @@ define void @test8(double* %base, <8 x d
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; KNL-LABEL: test8:
@@ -151,6 +161,7 @@ define void @test9(i64* %base, <8 x i64>
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; KNL-LABEL: test9:
@@ -170,6 +181,7 @@ define void @test10(i64* %base, <4 x i64
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vpcompressq %ymm0, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; KNL-LABEL: test10:
@@ -340,16 +352,28 @@ define <16 x double> @test16(double* %ba
}
define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) {
-; ALL-LABEL: test17:
-; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm4, %zmm4, %zmm4
-; ALL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
-; ALL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2
-; ALL-NEXT: kmovw %k2, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1}
-; ALL-NEXT: vcompressps %zmm0, (%rdi) {%k2}
-; ALL-NEXT: retq
+; SKX-LABEL: test17:
+; SKX: # BB#0:
+; SKX-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k2
+; SKX-NEXT: kmovw %k2, %eax
+; SKX-NEXT: popcntl %eax, %eax
+; SKX-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1}
+; SKX-NEXT: vcompressps %zmm0, (%rdi) {%k2}
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test17:
+; KNL: # BB#0:
+; KNL-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; KNL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; KNL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2
+; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: popcntl %eax, %eax
+; KNL-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1}
+; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k2}
+; KNL-NEXT: retq
%mask = icmp eq <32 x i32> %trigger, zeroinitializer
call void @llvm.masked.compressstore.v32f32(<32 x float> %V, float* %base, <32 x i1> %mask)
ret void
@@ -365,6 +389,7 @@ define void @test18(double* %base, <16 x
; SKX-NEXT: popcntl %eax, %eax
; SKX-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
; SKX-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; KNL-LABEL: test18:
Modified: llvm/trunk/test/CodeGen/X86/fast-isel-nontemporal.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fast-isel-nontemporal.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fast-isel-nontemporal.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fast-isel-nontemporal.ll Fri Mar 3 03:03:24 2017
@@ -379,6 +379,7 @@ define void @test_nt8xfloat(<8 x float>*
; AVX512-LABEL: test_nt8xfloat:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntps %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <8 x float> %X, <8 x float>* %ptr, align 32, !nontemporal !1
@@ -401,6 +402,7 @@ define void @test_nt4xdouble(<4 x double
; AVX512-LABEL: test_nt4xdouble:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntpd %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <4 x double> %X, <4 x double>* %ptr, align 32, !nontemporal !1
@@ -423,6 +425,7 @@ define void @test_nt32xi8(<32 x i8>* noc
; AVX512-LABEL: test_nt32xi8:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <32 x i8> %X, <32 x i8>* %ptr, align 32, !nontemporal !1
@@ -445,6 +448,7 @@ define void @test_nt16xi16(<16 x i16>* n
; AVX512-LABEL: test_nt16xi16:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <16 x i16> %X, <16 x i16>* %ptr, align 32, !nontemporal !1
@@ -467,6 +471,7 @@ define void @test_nt8xi32(<8 x i32>* noc
; AVX512-LABEL: test_nt8xi32:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <8 x i32> %X, <8 x i32>* %ptr, align 32, !nontemporal !1
@@ -489,6 +494,7 @@ define void @test_nt4xi64(<4 x i64>* noc
; AVX512-LABEL: test_nt4xi64:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <4 x i64> %X, <4 x i64>* %ptr, align 32, !nontemporal !1
@@ -750,6 +756,7 @@ define void @test_nt16xfloat(<16 x float
; AVX512-LABEL: test_nt16xfloat:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntps %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <16 x float> %X, <16 x float>* %ptr, align 64, !nontemporal !1
@@ -775,6 +782,7 @@ define void @test_nt8xdouble(<8 x double
; AVX512-LABEL: test_nt8xdouble:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntpd %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <8 x double> %X, <8 x double>* %ptr, align 64, !nontemporal !1
@@ -801,11 +809,13 @@ define void @test_nt64xi8(<64 x i8>* noc
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vmovntdq %ymm0, (%rdi)
; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_nt64xi8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
store <64 x i8> %X, <64 x i8>* %ptr, align 64, !nontemporal !1
@@ -832,11 +842,13 @@ define void @test_nt32xi16(<32 x i16>* n
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vmovntdq %ymm0, (%rdi)
; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_nt32xi16:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
store <32 x i16> %X, <32 x i16>* %ptr, align 64, !nontemporal !1
@@ -862,6 +874,7 @@ define void @test_nt16xi32(<16 x i32>* n
; AVX512-LABEL: test_nt16xi32:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <16 x i32> %X, <16 x i32>* %ptr, align 64, !nontemporal !1
@@ -887,6 +900,7 @@ define void @test_nt8xi64(<8 x i64>* noc
; AVX512-LABEL: test_nt8xi64:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
store <8 x i64> %X, <8 x i64>* %ptr, align 64, !nontemporal !1
Modified: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll Fri Mar 3 03:03:24 2017
@@ -233,6 +233,7 @@ define void @test5(i32* %base, <16 x i32
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test5:
@@ -242,6 +243,7 @@ define void @test5(i32* %base, <16 x i32
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test5:
@@ -250,6 +252,7 @@ define void @test5(i32* %base, <16 x i32
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test5:
@@ -259,6 +262,7 @@ define void @test5(i32* %base, <16 x i32
; SKX_32-NEXT: kmovw %k1, %k2
; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
@@ -790,6 +794,7 @@ define <4 x float> @test15(float* %base,
; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test15:
@@ -804,6 +809,7 @@ define <4 x float> @test15(float* %base,
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test15:
@@ -900,6 +906,7 @@ define <2 x double> @test17(double* %bas
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovapd %xmm2, %xmm0
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test17:
@@ -913,6 +920,7 @@ define <2 x double> @test17(double* %bas
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovapd %xmm2, %xmm0
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test17:
@@ -956,6 +964,7 @@ define void @test18(<4 x i32>%a1, <4 x i
; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test18:
@@ -969,6 +978,7 @@ define void @test18(<4 x i32>%a1, <4 x i
; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test18:
@@ -976,6 +986,7 @@ define void @test18(<4 x i32>%a1, <4 x i
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test18:
@@ -1002,6 +1013,7 @@ define void @test19(<4 x double>%a1, dou
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test19:
@@ -1017,6 +1029,7 @@ define void @test19(<4 x double>%a1, dou
; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test19:
@@ -1024,6 +1037,7 @@ define void @test19(<4 x double>%a1, dou
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test19:
@@ -1032,6 +1046,7 @@ define void @test19(<4 x double>%a1, dou
; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
%gep = getelementptr double, double* %ptr, <4 x i64> %ind
call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
@@ -1051,6 +1066,7 @@ define void @test20(<2 x float>%a1, <2 x
; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test20:
@@ -1064,6 +1080,7 @@ define void @test20(<2 x float>%a1, <2 x
; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test20:
@@ -1074,6 +1091,7 @@ define void @test20(<2 x float>%a1, <2 x
; SKX-NEXT: kshiftlb $6, %k0, %k0
; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test20:
@@ -1101,6 +1119,7 @@ define void @test21(<2 x i32>%a1, <2 x i
; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test21:
@@ -1112,6 +1131,7 @@ define void @test21(<2 x i32>%a1, <2 x i
; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test21:
@@ -1123,6 +1143,7 @@ define void @test21(<2 x i32>%a1, <2 x i
; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test21:
@@ -1134,6 +1155,7 @@ define void @test21(<2 x i32>%a1, <2 x i
; SKX_32-NEXT: kshiftrb $6, %k0, %k1
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
ret void
@@ -1157,6 +1179,7 @@ define <2 x float> @test22(float* %base,
; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT: vmovaps %xmm2, %xmm0
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test22:
@@ -1172,6 +1195,7 @@ define <2 x float> @test22(float* %base,
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT: vmovaps %xmm2, %xmm0
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test22:
@@ -1217,6 +1241,7 @@ define <2 x i32> @test23(i32* %base, <2
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test23:
@@ -1230,6 +1255,7 @@ define <2 x i32> @test23(i32* %base, <2
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test23:
@@ -1262,6 +1288,7 @@ define <2 x i32> @test24(i32* %base, <2
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test24:
@@ -1274,6 +1301,7 @@ define <2 x i32> @test24(i32* %base, <2
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test24:
@@ -1308,6 +1336,7 @@ define <2 x i64> @test25(i64* %base, <2
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test25:
@@ -1321,6 +1350,7 @@ define <2 x i64> @test25(i64* %base, <2
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test25:
@@ -1355,6 +1385,7 @@ define <2 x i64> @test26(i64* %base, <2
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test26:
@@ -1368,6 +1399,7 @@ define <2 x i64> @test26(i64* %base, <2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test26:
@@ -1401,6 +1433,7 @@ define <2 x float> @test27(float* %base,
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test27:
@@ -1412,6 +1445,7 @@ define <2 x float> @test27(float* %base,
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test27:
@@ -1447,6 +1481,7 @@ define void @test28(<2 x i32>%a1, <2 x i
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test28:
@@ -1458,6 +1493,7 @@ define void @test28(<2 x i32>%a1, <2 x i
; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test28:
@@ -1467,6 +1503,7 @@ define void @test28(<2 x i32>%a1, <2 x i
; SKX-NEXT: movb $3, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test28:
@@ -1476,6 +1513,7 @@ define void @test28(<2 x i32>%a1, <2 x i
; SKX_32-NEXT: movb $3, %al
; SKX_32-NEXT: kmovb %eax, %k1
; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
ret void
@@ -1848,6 +1886,7 @@ define void @test_scatter_16i32(<16 x i3
; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16i32:
@@ -1856,6 +1895,7 @@ define void @test_scatter_16i32(<16 x i3
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16i32:
@@ -1867,6 +1907,7 @@ define void @test_scatter_16i32(<16 x i3
; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0
; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16i32:
@@ -1875,6 +1916,7 @@ define void @test_scatter_16i32(<16 x i3
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
ret void
@@ -1888,6 +1930,7 @@ define void @test_scatter_16i64(<16 x i6
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16i64:
@@ -1912,6 +1955,7 @@ define void @test_scatter_16i64(<16 x i6
; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16i64:
@@ -1922,6 +1966,7 @@ define void @test_scatter_16i64(<16 x i6
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16i64:
@@ -1946,6 +1991,7 @@ define void @test_scatter_16i64(<16 x i6
; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
ret void
@@ -1961,6 +2007,7 @@ define void @test_scatter_16f32(<16 x fl
; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16f32:
@@ -1969,6 +2016,7 @@ define void @test_scatter_16f32(<16 x fl
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16f32:
@@ -1980,6 +2028,7 @@ define void @test_scatter_16f32(<16 x fl
; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0
; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16f32:
@@ -1988,6 +2037,7 @@ define void @test_scatter_16f32(<16 x fl
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
ret void
@@ -2002,6 +2052,7 @@ define void @test_scatter_16f64(<16 x do
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
+; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16f64:
@@ -2026,6 +2077,7 @@ define void @test_scatter_16f64(<16 x do
; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16f64:
@@ -2036,6 +2088,7 @@ define void @test_scatter_16f64(<16 x do
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16f64:
@@ -2060,6 +2113,7 @@ define void @test_scatter_16f64(<16 x do
; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
ret void
@@ -2082,6 +2136,34 @@ define <4 x i64> @test_pr28312(<4 x i64*
; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; KNL_64-NEXT: retq
;
+; KNL_32-LABEL: test_pr28312:
+; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebp
+; KNL_32-NEXT: .Lcfi12:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .Lcfi13:
+; KNL_32-NEXT: .cfi_offset %ebp, -8
+; KNL_32-NEXT: movl %esp, %ebp
+; KNL_32-NEXT: .Lcfi14:
+; KNL_32-NEXT: .cfi_def_cfa_register %ebp
+; KNL_32-NEXT: andl $-32, %esp
+; KNL_32-NEXT: subl $32, %esp
+; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
+; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
+; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1}
+; KNL_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
+; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; KNL_32-NEXT: movl %ebp, %esp
+; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: retl
+;
; SKX-LABEL: test_pr28312:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
@@ -2090,6 +2172,27 @@ define <4 x i64> @test_pr28312(<4 x i64*
; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_pr28312:
+; SKX_32: # BB#0:
+; SKX_32-NEXT: pushl %ebp
+; SKX_32-NEXT: .Lcfi13:
+; SKX_32-NEXT: .cfi_def_cfa_offset 8
+; SKX_32-NEXT: .Lcfi14:
+; SKX_32-NEXT: .cfi_offset %ebp, -8
+; SKX_32-NEXT: movl %esp, %ebp
+; SKX_32-NEXT: .Lcfi15:
+; SKX_32-NEXT: .cfi_def_cfa_register %ebp
+; SKX_32-NEXT: andl $-32, %esp
+; SKX_32-NEXT: subl $32, %esp
+; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
+; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
+; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; SKX_32-NEXT: movl %ebp, %esp
+; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: retl
%g1 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
%g2 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
%g3 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
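
For reference, the scatter pattern these checks exercise reduces to a sketch like the one below (function name illustrative; the intrinsic signature is taken from the calls visible in the context lines above, and the RUN-line flags are an assumption along the lines of -mattr=+avx512f). The scattered value occupies a full ZMM register and nothing after the scatter reads the upper bits, so the VZeroUpper pass now appends VZEROUPPER before the return under each of the check prefixes above:

declare void @llvm.masked.scatter.v16i32(<16 x i32>, <16 x i32*>, i32, <16 x i1>)

define void @scatter_sketch(<16 x i32> %src, <16 x i32*> %ptrs, <16 x i1> %mask) {
; %src and %ptrs live in ZMM registers; after the scatter the upper
; vector state is dirty but dead, so VZEROUPPER is inserted before RETQ.
  call void @llvm.masked.scatter.v16i32(<16 x i32> %src, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}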
Modified: llvm/trunk/test/CodeGen/X86/masked_memop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_memop.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_memop.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_memop.ll Fri Mar 3 03:03:24 2017
@@ -415,6 +415,7 @@ define void @test12(<8 x i32> %trigger,
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: test12:
@@ -422,6 +423,7 @@ define void @test12(<8 x i32> %trigger,
; SKX-NEXT: vpxor %ymm2, %ymm2, %ymm2
; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1}
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
@@ -998,12 +1000,14 @@ define void @one_mask_bit_set3(<4 x i64>
; AVX512F: ## BB#0:
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovlps %xmm0, 16(%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; SKX-LABEL: one_mask_bit_set3:
; SKX: ## BB#0:
; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vmovq %xmm0, 16(%rdi)
+; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
ret void
@@ -1023,6 +1027,7 @@ define void @one_mask_bit_set4(<4 x doub
; AVX512: ## BB#0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
ret void
@@ -1042,6 +1047,7 @@ define void @one_mask_bit_set5(<8 x doub
; AVX512: ## BB#0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmovlps %xmm0, 48(%rdi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
ret void
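
Worth noting in the one_mask_bit_set cases: even when the masked store folds down to a single scalar move of one extracted lane, the incoming YMM argument still dirties the upper state, so VZEROUPPER is required all the same. A minimal sketch, built from the call visible above (function name and parameter order assumed):

declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>)

define void @one_bit_store_sketch(<4 x i64> %val, <4 x i64>* %addr) {
; Only lane 2 is actually stored (vextractf128 + a scalar move in the
; checks above), but %val arrives in YMM0, leaving dirty upper bits.
  call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>)
  ret void
}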
Modified: llvm/trunk/test/CodeGen/X86/nontemporal-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/nontemporal-2.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/nontemporal-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/nontemporal-2.ll Fri Mar 3 03:03:24 2017
@@ -255,6 +255,7 @@ define void @test_zero_v8f32(<8 x float>
; VLX: # BB#0:
; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
ret void
@@ -279,6 +280,7 @@ define void @test_zero_v8i32(<8 x i32>*
; VLX: # BB#0:
; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
ret void
@@ -303,6 +305,7 @@ define void @test_zero_v4f64(<4 x double
; VLX: # BB#0:
; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
ret void
@@ -327,6 +330,7 @@ define void @test_zero_v4i64(<4 x i64>*
; VLX: # BB#0:
; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
ret void
@@ -351,6 +355,7 @@ define void @test_zero_v16i16(<16 x i16>
; VLX: # BB#0:
; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
ret void
@@ -375,6 +380,7 @@ define void @test_zero_v32i8(<32 x i8>*
; VLX: # BB#0:
; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
ret void
@@ -757,6 +763,7 @@ define void @test_arg_v8f32(<8 x float>
; VLX-LABEL: test_arg_v8f32:
; VLX: # BB#0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
ret void
@@ -778,6 +785,7 @@ define void @test_arg_v8i32(<8 x i32> %a
; VLX-LABEL: test_arg_v8i32:
; VLX: # BB#0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
ret void
@@ -799,6 +807,7 @@ define void @test_arg_v4f64(<4 x double>
; VLX-LABEL: test_arg_v4f64:
; VLX: # BB#0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
ret void
@@ -820,6 +829,7 @@ define void @test_arg_v4i64(<4 x i64> %a
; VLX-LABEL: test_arg_v4i64:
; VLX: # BB#0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
ret void
@@ -841,6 +851,7 @@ define void @test_arg_v16i16(<16 x i16>
; VLX-LABEL: test_arg_v16i16:
; VLX: # BB#0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
ret void
@@ -862,6 +873,7 @@ define void @test_arg_v32i8(<32 x i8> %a
; VLX-LABEL: test_arg_v32i8:
; VLX: # BB#0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
ret void
@@ -1031,6 +1043,7 @@ define void @test_op_v8f32(<8 x float> %
; VLX: # BB#0:
; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = fadd <8 x float> %a, %b
store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
@@ -1068,6 +1081,7 @@ define void @test_op_v8i32(<8 x i32> %a,
; VLX: # BB#0:
; VLX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = add <8 x i32> %a, %b
store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
@@ -1094,6 +1108,7 @@ define void @test_op_v4f64(<4 x double>
; VLX: # BB#0:
; VLX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntpd %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = fadd <4 x double> %a, %b
store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
@@ -1131,6 +1146,7 @@ define void @test_op_v4i64(<4 x i64> %a,
; VLX: # BB#0:
; VLX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = add <4 x i64> %a, %b
store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
@@ -1168,6 +1184,7 @@ define void @test_op_v16i16(<16 x i16> %
; VLX: # BB#0:
; VLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = add <16 x i16> %a, %b
store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
@@ -1205,6 +1222,7 @@ define void @test_op_v32i8(<32 x i8> %a,
; VLX: # BB#0:
; VLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = add <32 x i8> %a, %b
store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
@@ -1235,6 +1253,7 @@ define void @test_unaligned_v8f32(<8 x f
; VLX: # BB#0:
; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovups %ymm0, (%rdi)
+; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = fadd <8 x float> %a, %b
store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1
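
The nontemporal cases all share one shape: a YMM value is produced (or materialized as zero), written with a nontemporal store, and the function returns. A sketch using the store and the standard !nontemporal metadata form seen in the test (function name illustrative):

define void @nt_store_sketch(<8 x float>* %dst) {
; vpxor materializes the zero in YMM0 and vmovntdq streams it out; the
; value is dead at the return, so VZEROUPPER now precedes RETQ.
  store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}

!1 = !{i32 1}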
Modified: llvm/trunk/test/CodeGen/X86/pmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pmul.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pmul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pmul.ll Fri Mar 3 03:03:24 2017
@@ -55,6 +55,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v16i8c:
@@ -63,6 +64,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
%A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
@@ -195,6 +197,7 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v16i8:
@@ -204,6 +207,7 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
%A = mul <16 x i8> %i, %j
@@ -1168,24 +1172,15 @@ define <4 x i32> @mul_v4i64_zero_upper(<
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX2-LABEL: mul_v4i64_zero_upper:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: mul_v4i64_zero_upper:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX512-NEXT: retq
+; AVX-LABEL: mul_v4i64_zero_upper:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
entry:
%val1a = zext <4 x i32> %val1 to <4 x i64>
%val2a = zext <4 x i32> %val2 to <4 x i64>
@@ -1237,30 +1232,18 @@ define <4 x i32> @mul_v4i64_zero_upper_l
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; SSE41-NEXT: retq
;
-; AVX2-LABEL: mul_v4i64_zero_upper_left:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: mul_v4i64_zero_upper_left:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX512-NEXT: retq
+; AVX-LABEL: mul_v4i64_zero_upper_left:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
entry:
%val1a = zext <4 x i32> %val1 to <4 x i64>
%res64 = mul <4 x i64> %val1a, %val2
@@ -1301,26 +1284,16 @@ define <4 x i32> @mul_v4i64_zero_lower(<
; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
-; AVX2-LABEL: mul_v4i64_zero_lower:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: mul_v4i64_zero_lower:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX512-NEXT: retq
+; AVX-LABEL: mul_v4i64_zero_lower:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
entry:
%val1a = zext <4 x i32> %val1 to <4 x i64>
%val2a = and <4 x i64> %val2, <i64 -4294967296, i64 -4294967296, i64 -4294967296, i64 -4294967296>
Modified: llvm/trunk/test/CodeGen/X86/pr29112.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr29112.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr29112.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr29112.ll Fri Mar 3 03:03:24 2017
@@ -60,6 +60,7 @@ define <4 x float> @bar(<4 x float>* %a1
; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovaps %xmm9, (%rsp)
; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: vaddps {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
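
pr29112.ll shows the pass's other insertion point: VZEROUPPER goes in front of a call, not only before returns. When the upper halves are dirty but no YMM/ZMM value is live across the call, clearing them first spares the callee any AVX/SSE transition penalty. A sketch (the callee's signature and the vector types here are assumptions, not taken from the test):

declare <4 x float> @foo(<4 x float>)

define <4 x float> @call_sketch(<8 x float> %a) {
; %a arrives in YMM0; only its low half is passed on, so the upper
; state is dirty but dead and VZEROUPPER lands right before CALLQ.
  %lo = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %r = call <4 x float> @foo(<4 x float> %lo)
  ret <4 x float> %r
}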
Modified: llvm/trunk/test/CodeGen/X86/pr30284.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr30284.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr30284.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr30284.ll Fri Mar 3 03:03:24 2017
@@ -11,6 +11,7 @@ define void @f_f___un_3C_unf_3E_un_3C_un
; CHECK-NEXT: vorpd %zmm2, %zmm1, %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm1, 64
; CHECK-NEXT: vmovapd %zmm0, 0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
%a_load22 = load <16 x i64>, <16 x i64>* null, align 1
%bitop = or <16 x i64> %a_load22, <i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736>
Modified: llvm/trunk/test/CodeGen/X86/sad.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sad.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sad.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sad.ll Fri Mar 3 03:03:24 2017
@@ -81,6 +81,7 @@ define i32 @sad_16i8() nounwind {
; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_16i8:
@@ -106,6 +107,7 @@ define i32 @sad_16i8() nounwind {
; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
br label %vector.body
@@ -324,6 +326,7 @@ define i32 @sad_32i8() nounwind {
; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_32i8:
@@ -351,6 +354,7 @@ define i32 @sad_32i8() nounwind {
; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
br label %vector.body
@@ -799,6 +803,7 @@ define i32 @sad_avx64i8() nounwind {
; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_avx64i8:
@@ -827,6 +832,7 @@ define i32 @sad_avx64i8() nounwind {
; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
br label %vector.body
@@ -1264,6 +1270,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>*
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_nonloop_32i8:
@@ -1275,6 +1282,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>*
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%v1 = load <32 x i8>, <32 x i8>* %p, align 1
%z1 = zext <32 x i8> %v1 to <32 x i32>
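
The sad.ll reductions end by moving the final sum into a GPR (vmovd %xmm0, %eax), so every vector register is dead at the return and the trailing VZEROUPPER has nothing left to preserve. The shape, minus the PSADBW loop, is an ordinary horizontal add (sketch only; names illustrative):

define i32 @reduce_add_sketch(<8 x i32> %v) {
; Fold the upper half onto the lower, then halve twice more; the
; result retires through a GPR, leaving all YMM state dead.
  %hi = shufflevector <8 x i32> %v, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo = shufflevector <8 x i32> %v, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %s0 = add <4 x i32> %lo, %hi
  %h1 = shufflevector <4 x i32> %s0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %s1 = add <4 x i32> %s0, %h1
  %h2 = shufflevector <4 x i32> %s1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %s2 = add <4 x i32> %s1, %h2
  %r = extractelement <4 x i32> %s2, i32 0
  ret i32 %r
}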
Modified: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll Fri Mar 3 03:03:24 2017
@@ -31,6 +31,7 @@ define void @shuffle_v32i8_to_v16i8(<32
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
@@ -42,6 +43,7 @@ define void @shuffle_v32i8_to_v16i8(<32
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
@@ -53,6 +55,7 @@ define void @shuffle_v32i8_to_v16i8(<32
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
@@ -64,6 +67,7 @@ define void @shuffle_v32i8_to_v16i8(<32
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -89,6 +93,7 @@ define void @trunc_v16i16_to_v16i8(<32 x
; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
@@ -96,6 +101,7 @@ define void @trunc_v16i16_to_v16i8(<32 x
; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
@@ -103,12 +109,14 @@ define void @trunc_v16i16_to_v16i8(<32 x
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%bc = bitcast <32 x i8> %vec to <16 x i16>
@@ -139,6 +147,7 @@ define void @shuffle_v16i16_to_v8i16(<16
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
@@ -153,6 +162,7 @@ define void @shuffle_v16i16_to_v8i16(<16
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
@@ -164,6 +174,7 @@ define void @shuffle_v16i16_to_v8i16(<16
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
@@ -178,6 +189,7 @@ define void @shuffle_v16i16_to_v8i16(<16
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -200,12 +212,14 @@ define void @trunc_v8i32_to_v8i16(<16 x
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
@@ -213,12 +227,14 @@ define void @trunc_v8i32_to_v8i16(<16 x
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%bc = bitcast <16 x i16> %vec to <8 x i32>
@@ -243,6 +259,7 @@ define void @shuffle_v8i32_to_v4i32(<8 x
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512F-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_to_v4i32:
@@ -251,6 +268,7 @@ define void @shuffle_v8i32_to_v4i32(<8 x
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512VL-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i32_to_v4i32:
@@ -259,6 +277,7 @@ define void @shuffle_v8i32_to_v4i32(<8 x
; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32:
@@ -267,6 +286,7 @@ define void @shuffle_v8i32_to_v4i32(<8 x
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512BWVL-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %L
%strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -288,12 +308,14 @@ define void @trunc_v4i64_to_v4i32(<8 x i
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
@@ -301,12 +323,14 @@ define void @trunc_v4i64_to_v4i32(<8 x i
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %L
%bc = bitcast <8 x i32> %vec to <4 x i64>
@@ -337,6 +361,7 @@ define void @shuffle_v32i8_to_v8i8(<32 x
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
@@ -348,6 +373,7 @@ define void @shuffle_v32i8_to_v8i8(<32 x
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
@@ -359,6 +385,7 @@ define void @shuffle_v32i8_to_v8i8(<32 x
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
@@ -373,6 +400,7 @@ define void @shuffle_v32i8_to_v8i8(<32 x
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -397,12 +425,14 @@ define void @trunc_v8i32_to_v8i8(<32 x i
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
@@ -411,12 +441,14 @@ define void @trunc_v8i32_to_v8i8(<32 x i
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%bc = bitcast <32 x i8> %vec to <8 x i32>
@@ -449,6 +481,7 @@ define void @shuffle_v16i16_to_v4i16(<16
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
@@ -457,6 +490,7 @@ define void @shuffle_v16i16_to_v4i16(<16
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
@@ -469,6 +503,7 @@ define void @shuffle_v16i16_to_v4i16(<16
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
@@ -477,6 +512,7 @@ define void @shuffle_v16i16_to_v4i16(<16
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -500,12 +536,14 @@ define void @trunc_v4i64_to_v4i16(<16 x
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
@@ -514,12 +552,14 @@ define void @trunc_v4i64_to_v4i16(<16 x
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%bc = bitcast <16 x i16> %vec to <4 x i64>
@@ -550,6 +590,7 @@ define void @shuffle_v32i8_to_v4i8(<32 x
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
@@ -558,6 +599,7 @@ define void @shuffle_v32i8_to_v4i8(<32 x
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
@@ -569,6 +611,7 @@ define void @shuffle_v32i8_to_v4i8(<32 x
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
@@ -577,6 +620,7 @@ define void @shuffle_v32i8_to_v4i8(<32 x
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
@@ -600,12 +644,14 @@ define void @trunc_v4i64_to_v4i8(<32 x i
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
@@ -614,12 +660,14 @@ define void @trunc_v4i64_to_v4i8(<32 x i
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%bc = bitcast <32 x i8> %vec to <4 x i64>
Modified: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll Fri Mar 3 03:03:24 2017
@@ -18,6 +18,7 @@ define void @shuffle_v64i8_to_v32i8(<64
; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
@@ -29,6 +30,7 @@ define void @shuffle_v64i8_to_v32i8(<64
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
@@ -40,6 +42,7 @@ define void @shuffle_v64i8_to_v32i8(<64
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
@@ -51,6 +54,7 @@ define void @shuffle_v64i8_to_v32i8(<64
; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
@@ -67,6 +71,7 @@ define void @trunc_v32i16_to_v32i8(<64 x
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
@@ -77,18 +82,21 @@ define void @trunc_v32i16_to_v32i8(<64 x
; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%bc = bitcast <64 x i8> %vec to <32 x i16>
@@ -107,6 +115,7 @@ define void @shuffle_v32i16_to_v16i16(<3
; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
@@ -118,6 +127,7 @@ define void @shuffle_v32i16_to_v16i16(<3
; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
@@ -131,6 +141,7 @@ define void @shuffle_v32i16_to_v16i16(<3
; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
@@ -141,6 +152,7 @@ define void @shuffle_v32i16_to_v16i16(<3
; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -153,6 +165,7 @@ define void @trunc_v16i32_to_v16i16(<32
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%bc = bitcast <32 x i16> %vec to <16 x i32>
@@ -169,6 +182,7 @@ define void @shuffle_v16i32_to_v8i32(<16
; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %L
%strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -181,6 +195,7 @@ define void @trunc_v8i64_to_v8i32(<16 x
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %L
%bc = bitcast <16 x i32> %vec to <8 x i64>
@@ -206,6 +221,7 @@ define void @shuffle_v64i8_to_v16i8(<64
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
@@ -224,6 +240,7 @@ define void @shuffle_v64i8_to_v16i8(<64
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
@@ -265,6 +282,7 @@ define void @shuffle_v64i8_to_v16i8(<64
; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
@@ -306,6 +324,7 @@ define void @shuffle_v64i8_to_v16i8(<64
; AVX512BWVL-NEXT: vpextrb $12, %xmm0, %eax
; AVX512BWVL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
@@ -318,6 +337,7 @@ define void @trunc_v16i32_to_v16i8(<64 x
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%bc = bitcast <64 x i8> %vec to <16 x i32>
@@ -345,6 +365,7 @@ define void @shuffle_v32i16_to_v8i16(<32
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
@@ -365,6 +386,7 @@ define void @shuffle_v32i16_to_v8i16(<32
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
@@ -388,6 +410,7 @@ define void @shuffle_v32i16_to_v8i16(<32
; AVX512BW-NEXT: vpextrw $4, %xmm0, %eax
; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
@@ -411,6 +434,7 @@ define void @shuffle_v32i16_to_v8i16(<32
; AVX512BWVL-NEXT: vpextrw $4, %xmm0, %eax
; AVX512BWVL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -423,6 +447,7 @@ define void @trunc_v8i64_to_v8i16(<32 x
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%bc = bitcast <32 x i16> %vec to <8 x i64>
@@ -448,6 +473,7 @@ define void @shuffle_v64i8_to_v8i8(<64 x
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
@@ -466,6 +492,7 @@ define void @shuffle_v64i8_to_v8i8(<64 x
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
@@ -491,6 +518,7 @@ define void @shuffle_v64i8_to_v8i8(<64 x
; AVX512BW-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
; AVX512BW-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
@@ -516,6 +544,7 @@ define void @shuffle_v64i8_to_v8i8(<64 x
; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax
; AVX512BWVL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
@@ -528,6 +557,7 @@ define void @trunc_v8i64_to_v8i8(<64 x i
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%bc = bitcast <64 x i8> %vec to <8 x i64>
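
The pattern in these shuffle/truncate tests is representative of the whole patch: any function that writes the upper half of a YMM or ZMM register now ends with a vzeroupper before retq when compiling for Skylake-avx512. A minimal reproducer sketch, mirroring the trunc_v8i64_to_v8i16 test above (the function name and RUN flags are illustrative, not from the patch):

define void @trunc_store(<8 x i64>* %src, <8 x i16>* %dst) {
  %v = load <8 x i64>, <8 x i64>* %src  ; 512-bit load, lowered to vmovdqa64 (%rdi), %zmm0
  %t = trunc <8 x i64> %v to <8 x i16>  ; lowered to vpmovqw %zmm0, (%rsi)
  store <8 x i16> %t, <8 x i16>* %dst
  ret void                              ; vzeroupper should now precede the retq here
}

Compiling this with llc -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 should show the new vzeroupper in the epilogue.
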
Modified: llvm/trunk/test/CodeGen/X86/sse-fsignum.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse-fsignum.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse-fsignum.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse-fsignum.ll Fri Mar 3 03:03:24 2017
@@ -102,6 +102,7 @@ define void @signum32b(<8 x float>*) {
; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512F-NEXT: vsubps %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vmovaps %ymm0, (%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
entry:
%1 = load <8 x float>, <8 x float>* %0
@@ -161,6 +162,7 @@ define void @signum64b(<4 x double>*) {
; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT: vsubpd %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vmovapd %ymm0, (%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
entry:
%1 = load <4 x double>, <4 x double>* %0
@@ -178,43 +180,18 @@ entry:
;
define void @signum32c(<8 x float>*) {
-; AVX1-LABEL: signum32c:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
-; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
-; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: vsubps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: signum32c:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %ymm0
-; AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vcvtdq2ps %ymm2, %ymm2
-; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: vsubps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vmovaps %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: signum32c:
-; AVX512F: # BB#0: # %entry
-; AVX512F-NEXT: vmovaps (%rdi), %ymm0
-; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vcvtdq2ps %ymm2, %ymm2
-; AVX512F-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512F-NEXT: vsubps %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vmovaps %ymm0, (%rdi)
-; AVX512F-NEXT: retq
+; AVX-LABEL: signum32c:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovaps (%rdi), %ymm0
+; AVX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
+; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT: vsubps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vmovaps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
entry:
%1 = load <8 x float>, <8 x float>* %0
%2 = tail call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %1, <8 x float> zeroinitializer, i8 1)
@@ -270,6 +247,7 @@ define void @signum64c(<4 x double>*) {
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT: vmovaps %ymm0, (%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
entry:
%x = load <4 x double>, <4 x double>* %0
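
Note what happened to signum32c above: once AVX512F emits the same trailing vzeroupper as AVX1 and AVX2, the three per-subtarget check blocks become identical and collapse into one shared AVX block. This is presumably the result of regenerating the checks with utils/update_llc_test_checks.py, which folds identical bodies under a common prefix when the RUN lines share one, along these lines (prefix names and flags illustrative):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx     | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2    | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512F
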
Modified: llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll Fri Mar 3 03:03:24 2017
@@ -1341,6 +1341,7 @@ define void @fallback_broadcast_v4i64_to
; X32-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT: vmovdqu %ymm0, _ga4
; X32-AVX512-NEXT: vmovdqu64 %zmm1, _gb4
+; X32-AVX512-NEXT: vzeroupper
; X32-AVX512-NEXT: retl
;
; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
@@ -1391,6 +1392,7 @@ define void @fallback_broadcast_v4i64_to
; X64-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vmovdqu %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT: vmovdqu64 %zmm1, {{.*}}(%rip)
+; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
entry:
%0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
@@ -1429,6 +1431,7 @@ define void @fallback_broadcast_v4f64_to
; X32-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT: vmovupd %ymm0, _ga2
; X32-AVX512-NEXT: vmovupd %zmm1, _gb2
+; X32-AVX512-NEXT: vzeroupper
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
@@ -1454,6 +1457,7 @@ define void @fallback_broadcast_v4f64_to
; X64-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vmovupd %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT: vmovupd %zmm1, {{.*}}(%rip)
+; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
entry:
%0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
Modified: llvm/trunk/test/CodeGen/X86/vec_fp_to_int.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_fp_to_int.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_fp_to_int.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_fp_to_int.ll Fri Mar 3 03:03:24 2017
@@ -63,6 +63,7 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f64_to_2i64:
@@ -112,18 +113,12 @@ define <4 x i32> @fptosi_4f64_to_2i32(<2
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
-; VEX-LABEL: fptosi_4f64_to_2i32:
-; VEX: # BB#0:
-; VEX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
-; VEX-NEXT: vzeroupper
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: fptosi_4f64_to_2i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: fptosi_4f64_to_2i32:
+; AVX: # BB#0:
+; AVX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%cvt = fptosi <4 x double> %ext to <4 x i32>
ret <4 x i32> %cvt
@@ -243,16 +238,11 @@ define <4 x i32> @fptosi_4f64_to_4i32(<4
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
-; VEX-LABEL: fptosi_4f64_to_4i32:
-; VEX: # BB#0:
-; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
-; VEX-NEXT: vzeroupper
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: fptosi_4f64_to_4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: fptosi_4f64_to_4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%cvt = fptosi <4 x double> %a to <4 x i32>
ret <4 x i32> %cvt
}
@@ -334,6 +324,7 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f64_to_2i64:
@@ -400,6 +391,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f64_to_4i32:
@@ -412,6 +404,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f64_to_4i32:
@@ -477,6 +470,7 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f64_to_2i32:
@@ -489,6 +483,7 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f64_to_2i32:
@@ -550,12 +545,14 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f64_to_2i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f64_to_2i32:
@@ -563,12 +560,14 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f64_to_2i32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%cvt = fptoui <4 x double> %ext to <4 x i32>
@@ -816,11 +815,13 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4
; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f64_to_4i32:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f64_to_4i32:
@@ -828,11 +829,13 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f64_to_4i32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = fptoui <4 x double> %a to <4 x i32>
ret <4 x i32> %cvt
@@ -980,12 +983,14 @@ define <2 x i64> @fptosi_4f32_to_2i64(<4
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_4f32_to_2i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = fptosi <4 x float> %a to <4 x i64>
%shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -1281,6 +1286,7 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f32_to_2i32:
@@ -1294,6 +1300,7 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32:
@@ -1347,6 +1354,7 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f32_to_4i32:
@@ -1359,6 +1367,7 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f32_to_4i32:
@@ -1529,12 +1538,14 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f32_to_2i64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = fptoui <4 x float> %a to <4 x i64>
%shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -2291,6 +2302,7 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2
; AVX512F-NEXT: vmovq %rax, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_2f16_to_4i32:
@@ -2321,6 +2333,7 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2
; AVX512DQ-NEXT: vmovq %rax, %xmm0
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f16_to_4i32:
Modified: llvm/trunk/test/CodeGen/X86/vec_fpext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_fpext.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_fpext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_fpext.ll Fri Mar 3 03:03:24 2017
@@ -82,6 +82,7 @@ define void @fpext_frommem4(<4 x float>*
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X32-AVX512VL-NEXT: vcvtps2pd (%ecx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0x01]
; X32-AVX512VL-NEXT: vmovups %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x00]
+; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-SSE-LABEL: fpext_frommem4:
@@ -103,6 +104,7 @@ define void @fpext_frommem4(<4 x float>*
; X64-AVX512VL: # BB#0: # %entry
; X64-AVX512VL-NEXT: vcvtps2pd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0x07]
; X64-AVX512VL-NEXT: vmovups %ymm0, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x06]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = load <4 x float>, <4 x float>* %in
@@ -143,6 +145,7 @@ define void @fpext_frommem8(<8 x float>*
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X32-AVX512VL-NEXT: vcvtps2pd (%ecx), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x5a,0x01]
; X32-AVX512VL-NEXT: vmovups %zmm0, (%eax) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x00]
+; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-SSE-LABEL: fpext_frommem8:
@@ -170,6 +173,7 @@ define void @fpext_frommem8(<8 x float>*
; X64-AVX512VL: # BB#0: # %entry
; X64-AVX512VL-NEXT: vcvtps2pd (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x5a,0x07]
; X64-AVX512VL-NEXT: vmovups %zmm0, (%rsi) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x06]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = load <8 x float>, <8 x float>* %in
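
The vec_fpext.ll checks above carry instruction encodings (note the "EVEX TO VEX Compression" comments), so this update also pins down that vzeroupper itself is a single three-byte VEX instruction, [0xc5,0xf8,0x77]. A sketch of a test that would produce such a check line (function name and flags illustrative):

define void @fpext4(<4 x float>* %in, <4 x double>* %out) {
  %v = load <4 x float>, <4 x float>* %in
  %e = fpext <4 x float> %v to <4 x double>   ; vcvtps2pd (%rdi), %ymm0
  store <4 x double> %e, <4 x double>* %out   ; vmovups %ymm0, (%rsi)
  ret void
}
; llc -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -show-mc-encoding should
; end the function with:
;   vzeroupper # encoding: [0xc5,0xf8,0x77]
;   retq # encoding: [0xc3]
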
Modified: llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll Fri Mar 3 03:03:24 2017
@@ -61,6 +61,7 @@ define <2 x double> @sitofp_2i64_to_2f64
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_2i64_to_2f64:
@@ -92,18 +93,12 @@ define <2 x double> @sitofp_4i32_to_2f64
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
-; VEX-LABEL: sitofp_4i32_to_2f64:
-; VEX: # BB#0:
-; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; VEX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; VEX-NEXT: vzeroupper
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: sitofp_4i32_to_2f64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512-NEXT: retq
+; AVX-LABEL: sitofp_4i32_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%cvt = sitofp <4 x i32> %a to <4 x double>
%shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
ret <2 x double> %shuf
@@ -156,6 +151,7 @@ define <2 x double> @sitofp_8i16_to_2f64
; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = sitofp <8 x i16> %a to <8 x double>
%shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -211,6 +207,7 @@ define <2 x double> @sitofp_16i8_to_2f64
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x double>
%shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -498,6 +495,7 @@ define <2 x double> @uitofp_2i64_to_2f64
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i64_to_2f64:
@@ -536,6 +534,7 @@ define <2 x double> @uitofp_2i32_to_2f64
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_2i32_to_2f64:
@@ -548,6 +547,7 @@ define <2 x double> @uitofp_2i32_to_2f64
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i32_to_2f64:
@@ -603,12 +603,14 @@ define <2 x double> @uitofp_4i32_to_2f64
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i32_to_2f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i32_to_2f64:
@@ -616,12 +618,14 @@ define <2 x double> @uitofp_4i32_to_2f64
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i32_to_2f64:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <4 x i32> %a to <4 x double>
%shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -675,6 +679,7 @@ define <2 x double> @uitofp_8i16_to_2f64
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = uitofp <8 x i16> %a to <8 x double>
%shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -730,6 +735,7 @@ define <2 x double> @uitofp_16i8_to_2f64
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x double>
%shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -1089,6 +1095,7 @@ define <4 x float> @sitofp_2i64_to_4f32(
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32:
@@ -1147,6 +1154,7 @@ define <4 x float> @sitofp_2i64_to_4f32_
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32_zero:
@@ -1212,12 +1220,14 @@ define <4 x float> @sitofp_4i64_to_4f32_
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32_undef:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%cvt = sitofp <4 x i64> %ext to <4 x float>
@@ -1288,6 +1298,7 @@ define <4 x float> @sitofp_8i16_to_4f32(
; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = sitofp <8 x i16> %a to <8 x float>
%shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1346,6 +1357,7 @@ define <4 x float> @sitofp_16i8_to_4f32(
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x float>
%shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1421,6 +1433,7 @@ define <4 x float> @sitofp_4i64_to_4f32(
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_4i64_to_4f32:
@@ -1437,6 +1450,7 @@ define <4 x float> @sitofp_4i64_to_4f32(
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_4i64_to_4f32:
@@ -1444,11 +1458,13 @@ define <4 x float> @sitofp_4i64_to_4f32(
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = sitofp <4 x i64> %a to <4 x float>
ret <4 x float> %cvt
@@ -1697,6 +1713,7 @@ define <4 x float> @uitofp_2i64_to_4f32(
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i64_to_4f32:
@@ -1805,6 +1822,7 @@ define <4 x float> @uitofp_2i64_to_2f32(
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i64_to_2f32:
@@ -1932,12 +1950,14 @@ define <4 x float> @uitofp_4i64_to_4f32_
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32_undef:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%cvt = uitofp <4 x i64> %ext to <4 x float>
@@ -1982,6 +2002,7 @@ define <4 x float> @uitofp_4i32_to_4f32(
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i32_to_4f32:
@@ -1994,6 +2015,7 @@ define <4 x float> @uitofp_4i32_to_4f32(
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i32_to_4f32:
@@ -2054,6 +2076,7 @@ define <4 x float> @uitofp_8i16_to_4f32(
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = uitofp <8 x i16> %a to <8 x float>
%shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -2112,6 +2135,7 @@ define <4 x float> @uitofp_16i8_to_4f32(
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x float>
%shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -2335,6 +2359,7 @@ define <4 x float> @uitofp_4i64_to_4f32(
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i64_to_4f32:
@@ -2351,6 +2376,7 @@ define <4 x float> @uitofp_4i64_to_4f32(
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i64_to_4f32:
@@ -2358,11 +2384,13 @@ define <4 x float> @uitofp_4i64_to_4f32(
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32:
; AVX512VLDQ: # BB#0:
; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <4 x i64> %a to <4 x float>
ret <4 x float> %cvt
@@ -2607,6 +2635,7 @@ define <2 x double> @sitofp_load_2i64_to
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_load_2i64_to_2f64:
@@ -2926,6 +2955,7 @@ define <2 x double> @uitofp_load_2i64_to
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_2i64_to_2f64:
@@ -2967,6 +2997,7 @@ define <2 x double> @uitofp_load_2i32_to
; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_2i32_to_2f64:
@@ -2981,6 +3012,7 @@ define <2 x double> @uitofp_load_2i32_to
; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64:
@@ -3416,6 +3448,7 @@ define <4 x float> @sitofp_load_4i64_to_
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_load_4i64_to_4f32:
@@ -3433,6 +3466,7 @@ define <4 x float> @sitofp_load_4i64_to_
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32:
@@ -3440,6 +3474,7 @@ define <4 x float> @sitofp_load_4i64_to_
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f32:
@@ -4003,6 +4038,7 @@ define <4 x float> @uitofp_load_4i64_to_
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_4i64_to_4f32:
@@ -4020,6 +4056,7 @@ define <4 x float> @uitofp_load_4i64_to_
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32:
@@ -4027,6 +4064,7 @@ define <4 x float> @uitofp_load_4i64_to_
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f32:
@@ -4079,6 +4117,7 @@ define <4 x float> @uitofp_load_4i32_to_
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_4i32_to_4f32:
@@ -4091,6 +4130,7 @@ define <4 x float> @uitofp_load_4i32_to_
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f32:
@@ -4810,6 +4850,7 @@ define void @aggregate_sitofp_8i16_to_8f
; AVX512-NEXT: vpmovsxwd 8(%rdi), %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: vmovaps %ymm0, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = load %Arguments, %Arguments* %a0, align 1
%2 = extractvalue %Arguments %1, 1
Modified: llvm/trunk/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll Fri Mar 3 03:03:24 2017
@@ -77,6 +77,7 @@ define <4 x float> @test_uitofp_v4i32_to
; AVX512F-NEXT: # kill
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_uitofp_v4i32_to_v4f32:
Modified: llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-compare-all_of.ll Fri Mar 3 03:03:24 2017
@@ -70,6 +70,7 @@ define i64 @test_v4f64_sext(<4 x double>
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <4 x double> %a0, %a1
%s = sext <4 x i1> %c to <4 x i64>
@@ -120,6 +121,7 @@ define i64 @test_v4f64_legal_sext(<4 x d
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cltq
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <4 x double> %a0, %a1
%s = sext <4 x i1> %c to <4 x i32>
@@ -210,6 +212,7 @@ define i32 @test_v8f32_sext(<8 x float>
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <8 x float> %a0, %a1
%s = sext <8 x i1> %c to <8 x i32>
@@ -261,6 +264,7 @@ define i32 @test_v8f32_legal_sext(<8 x f
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cwtl
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <8 x float> %a0, %a1
%s = sext <8 x i1> %c to <8 x i16>
@@ -356,6 +360,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <4 x i64> %a0, %a1
%s = sext <4 x i1> %c to <4 x i64>
@@ -422,6 +427,7 @@ define i64 @test_v4i64_legal_sext(<4 x i
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cltq
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <4 x i64> %a0, %a1
%s = sext <4 x i1> %c to <4 x i32>
@@ -527,6 +533,7 @@ define i32 @test_v8i32_sext(<8 x i32> %a
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <8 x i32> %a0, %a1
%s = sext <8 x i1> %c to <8 x i32>
@@ -593,6 +600,7 @@ define i32 @test_v8i32_legal_sext(<8 x i
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cwtl
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <8 x i32> %a0, %a1
%s = sext <8 x i1> %c to <8 x i16>
@@ -713,6 +721,7 @@ define i16 @test_v16i16_sext(<16 x i16>
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <16 x i16> %a0, %a1
%s = sext <16 x i1> %c to <16 x i16>
@@ -787,6 +796,7 @@ define i16 @test_v16i16_legal_sext(<16 x
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: movsbl %al, %eax
; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <16 x i16> %a0, %a1
%s = sext <16 x i1> %c to <16 x i8>
@@ -917,6 +927,7 @@ define i8 @test_v32i8_sext(<32 x i8> %a0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <32 x i8> %a0, %a1
%s = sext <32 x i1> %c to <32 x i8>
Modified: llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-compare-any_of.ll Fri Mar 3 03:03:24 2017
@@ -68,6 +68,7 @@ define i64 @test_v4f64_sext(<4 x double>
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <4 x double> %a0, %a1
%s = sext <4 x i1> %c to <4 x i64>
@@ -114,6 +115,7 @@ define i64 @test_v4f64_legal_sext(<4 x d
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cltq
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <4 x double> %a0, %a1
%s = sext <4 x i1> %c to <4 x i32>
@@ -196,6 +198,7 @@ define i32 @test_v8f32_sext(<8 x float>
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <8 x float> %a0, %a1
%s = sext <8 x i1> %c to <8 x i32>
@@ -243,6 +246,7 @@ define i32 @test_v8f32_legal_sext(<8 x f
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cwtl
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = fcmp ogt <8 x float> %a0, %a1
%s = sext <8 x i1> %c to <8 x i16>
@@ -334,6 +338,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <4 x i64> %a0, %a1
%s = sext <4 x i1> %c to <4 x i64>
@@ -394,6 +399,7 @@ define i64 @test_v4i64_legal_sext(<4 x i
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cltq
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <4 x i64> %a0, %a1
%s = sext <4 x i1> %c to <4 x i32>
@@ -489,6 +495,7 @@ define i32 @test_v8i32_sext(<8 x i32> %a
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <8 x i32> %a0, %a1
%s = sext <8 x i1> %c to <8 x i32>
@@ -549,6 +556,7 @@ define i32 @test_v8i32_legal_sext(<8 x i
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cwtl
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <8 x i32> %a0, %a1
%s = sext <8 x i1> %c to <8 x i16>
@@ -662,6 +670,7 @@ define i16 @test_v16i16_sext(<16 x i16>
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <16 x i16> %a0, %a1
%s = sext <16 x i1> %c to <16 x i16>
@@ -730,6 +739,7 @@ define i16 @test_v16i16_legal_sext(<16 x
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: movsbl %al, %eax
; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <16 x i16> %a0, %a1
%s = sext <16 x i1> %c to <16 x i8>
@@ -853,6 +863,7 @@ define i8 @test_v32i8_sext(<32 x i8> %a0
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <32 x i8> %a0, %a1
%s = sext <32 x i1> %c to <32 x i8>
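
These reduction tests make the trigger condition explicit: the result leaves the function in a general-purpose register (vmovd/vmovq into %eax/%rax, plus cltq/cwtl sign extensions), but the ymm compares earlier in the body still dirtied the upper register state, so vzeroupper is required before retq all the same. A reduced sketch (illustrative):

define i32 @first_lane(<8 x i32> %a, <8 x i32> %b) {
  %s = add <8 x i32> %a, %b               ; 256-bit vpaddd touches the ymm uppers
  %e = extractelement <8 x i32> %s, i32 0
  ret i32 %e                              ; scalar return, vzeroupper still expected
}
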
Modified: llvm/trunk/test/CodeGen/X86/vector-compare-results.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-compare-results.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-compare-results.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-compare-results.ll Fri Mar 3 03:03:24 2017
@@ -146,6 +146,7 @@ define <4 x i1> @test_cmp_v4f64(<4 x dou
; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = fcmp ogt <4 x double> %a0, %a1
ret <4 x i1> %1
@@ -181,6 +182,7 @@ define <8 x i1> @test_cmp_v8f32(<8 x flo
; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = fcmp ogt <8 x float> %a0, %a1
ret <8 x i1> %1
@@ -243,6 +245,7 @@ define <4 x i1> @test_cmp_v4i64(<4 x i64
; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp sgt <4 x i64> %a0, %a1
ret <4 x i1> %1
@@ -279,6 +282,7 @@ define <8 x i1> @test_cmp_v8i32(<8 x i32
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp sgt <8 x i32> %a0, %a1
ret <8 x i1> %1
@@ -315,6 +319,7 @@ define <16 x i1> @test_cmp_v16i16(<16 x
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16i16:
@@ -322,6 +327,7 @@ define <16 x i1> @test_cmp_v16i16(<16 x
; AVX512DQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16i16:
@@ -329,6 +335,7 @@ define <16 x i1> @test_cmp_v16i16(<16 x
; AVX512BW-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = icmp sgt <16 x i16> %a0, %a1
ret <16 x i1> %1
@@ -610,6 +617,7 @@ define <8 x i1> @test_cmp_v8f64(<8 x dou
; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v8f64:
@@ -617,6 +625,7 @@ define <8 x i1> @test_cmp_v8f64(<8 x dou
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v8f64:
@@ -624,6 +633,7 @@ define <8 x i1> @test_cmp_v8f64(<8 x dou
; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = fcmp ogt <8 x double> %a0, %a1
ret <8 x i1> %1
@@ -670,6 +680,7 @@ define <16 x i1> @test_cmp_v16f32(<16 x
; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16f32:
@@ -677,6 +688,7 @@ define <16 x i1> @test_cmp_v16f32(<16 x
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16f32:
@@ -684,6 +696,7 @@ define <16 x i1> @test_cmp_v16f32(<16 x
; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = fcmp ogt <16 x float> %a0, %a1
ret <16 x i1> %1
@@ -781,6 +794,7 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v8i64:
@@ -788,6 +802,7 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64
; AVX512DQ-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v8i64:
@@ -795,6 +810,7 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64
; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = icmp sgt <8 x i64> %a0, %a1
ret <8 x i1> %1
@@ -844,6 +860,7 @@ define <16 x i1> @test_cmp_v16i32(<16 x
; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16i32:
@@ -851,6 +868,7 @@ define <16 x i1> @test_cmp_v16i32(<16 x
; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16i32:
@@ -858,6 +876,7 @@ define <16 x i1> @test_cmp_v16i32(<16 x
; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = icmp sgt <16 x i32> %a0, %a1
ret <16 x i1> %1
@@ -1962,6 +1981,7 @@ define <64 x i1> @test_cmp_v64i8(<64 x i
; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm3
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512F-NEXT: vmovdqa %xmm4, %xmm2
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v64i8:
@@ -1972,6 +1992,7 @@ define <64 x i1> @test_cmp_v64i8(<64 x i
; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm3
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm2
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v64i8:
@@ -2166,6 +2187,7 @@ define <16 x i1> @test_cmp_v16f64(<16 x
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16f64:
@@ -2281,6 +2303,7 @@ define <16 x i1> @test_cmp_v16f64(<16 x
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16f64:
@@ -2396,6 +2419,7 @@ define <16 x i1> @test_cmp_v16f64(<16 x
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = fcmp ogt <16 x double> %a0, %a1
ret <16 x i1> %1
@@ -3638,6 +3662,7 @@ define <16 x i1> @test_cmp_v16i64(<16 x
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16i64:
@@ -3769,6 +3794,7 @@ define <16 x i1> @test_cmp_v16i64(<16 x
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16i64:
@@ -3900,6 +3926,7 @@ define <16 x i1> @test_cmp_v16i64(<16 x
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = icmp sgt <16 x i64> %a0, %a1
ret <16 x i1> %1
@@ -6029,6 +6056,7 @@ define <64 x i1> @test_cmp_v64i16(<64 x
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512F-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v64i16:
@@ -6315,6 +6343,7 @@ define <64 x i1> @test_cmp_v64i16(<64 x
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v64i16:
@@ -8382,6 +8411,7 @@ define <128 x i1> @test_cmp_v128i8(<128
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, (%rdi)
; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v128i8:
@@ -8427,6 +8457,7 @@ define <128 x i1> @test_cmp_v128i8(<128
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
; AVX512DQ-NEXT: movq %rdi, %rax
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v128i8:
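
The vector-half-conversions.ll hunks that follow show the other half of the pass: vzeroupper is inserted not only before returns but also before calls, here ahead of each __truncdfhf2 libcall, so the callee does not pay an AVX-to-SSE transition penalty in its legacy SSE code. The shape of the affected tests, in reduced form (illustrative):

define <4 x i16> @cvt4(<4 x double> %a0) {
  %1 = fptrunc <4 x double> %a0 to <4 x half> ; expands to four __truncdfhf2 libcalls
  %2 = bitcast <4 x half> %1 to <4 x i16>
  ret <4 x i16> %2                            ; vzeroupper now precedes each callq too
}
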
Modified: llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll Fri Mar 3 03:03:24 2017
@@ -29,6 +29,7 @@ define float @cvt_i16_to_f32(i16 %a0) no
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_i16_to_f32:
@@ -122,6 +123,7 @@ define <4 x float> @cvt_4i16_to_4f32(<4
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4i16_to_4f32:
@@ -232,6 +234,7 @@ define <4 x float> @cvt_8i16_to_4f32(<8
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_4f32:
@@ -880,6 +883,7 @@ define float @load_cvt_i16_to_f32(i16* %
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_i16_to_f32:
@@ -950,6 +954,7 @@ define <4 x float> @load_cvt_4i16_to_4f3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_4i16_to_4f32:
@@ -1053,6 +1058,7 @@ define <4 x float> @load_cvt_8i16_to_4f3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
@@ -1534,6 +1540,7 @@ define double @cvt_i16_to_f64(i16 %a0) n
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_i16_to_f64:
@@ -1598,6 +1605,7 @@ define <2 x double> @cvt_2i16_to_2f64(<2
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_2i16_to_2f64:
@@ -1789,6 +1797,7 @@ define <2 x double> @cvt_8i16_to_2f64(<8
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_2f64:
@@ -2187,6 +2196,7 @@ define double @load_cvt_i16_to_f64(i16*
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_i16_to_f64:
@@ -2240,6 +2250,7 @@ define <2 x double> @load_cvt_2i16_to_2f
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_2i16_to_2f64:
@@ -2684,6 +2695,7 @@ define i16 @cvt_f32_to_i16(float %a0) no
; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_f32_to_i16:
@@ -2769,6 +2781,7 @@ define <4 x i16> @cvt_4f32_to_4i16(<4 x
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_4i16:
@@ -2874,6 +2887,7 @@ define <8 x i16> @cvt_4f32_to_8i16_undef
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_8i16_undef:
@@ -2982,6 +2996,7 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
@@ -3157,6 +3172,7 @@ define <8 x i16> @cvt_8f32_to_8i16(<8 x
; AVX512F-NEXT: vmovq %rsi, %xmm0
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8f32_to_8i16:
@@ -3203,6 +3219,7 @@ define <8 x i16> @cvt_8f32_to_8i16(<8 x
; AVX512VL-NEXT: vmovq %rsi, %xmm0
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
@@ -3509,6 +3526,7 @@ define void @store_cvt_f32_to_i16(float
; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: movw %ax, (%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_f32_to_i16:
@@ -3580,6 +3598,7 @@ define void @store_cvt_4f32_to_4i16(<4 x
; AVX512F-NEXT: movw %dx, 6(%rdi)
; AVX512F-NEXT: movw %cx, 4(%rdi)
; AVX512F-NEXT: movw %ax, 2(%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f32_to_4i16:
@@ -3684,6 +3703,7 @@ define void @store_cvt_4f32_to_8i16_unde
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f32_to_8i16_undef:
@@ -3797,6 +3817,7 @@ define void @store_cvt_4f32_to_8i16_zero
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero:
@@ -3941,6 +3962,7 @@ define void @store_cvt_8f32_to_8i16(<8 x
; AVX512F-NEXT: movw %r10w, 6(%rdi)
; AVX512F-NEXT: movw %r9w, 4(%rdi)
; AVX512F-NEXT: movw %r8w, 2(%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_8f32_to_8i16:
@@ -3976,6 +3998,7 @@ define void @store_cvt_8f32_to_8i16(<8 x
; AVX512VL-NEXT: movw %r10w, 6(%rdi)
; AVX512VL-NEXT: movw %r9w, 4(%rdi)
; AVX512VL-NEXT: movw %r8w, 2(%rdi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
@@ -4183,6 +4206,7 @@ define void @store_cvt_16f32_to_16i16(<1
; AVX512F-NEXT: movw %ax, 4(%rdi)
; AVX512F-NEXT: vmovd %xmm4, %eax
; AVX512F-NEXT: movw %ax, 2(%rdi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_16f32_to_16i16:
@@ -4250,6 +4274,7 @@ define void @store_cvt_16f32_to_16i16(<1
; AVX512VL-NEXT: movw %ax, 4(%rdi)
; AVX512VL-NEXT: vmovd %xmm4, %eax
; AVX512VL-NEXT: movw %ax, 2(%rdi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>
%2 = bitcast <16 x half> %1 to <16 x i16>
@@ -4375,11 +4400,13 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x
; AVX512F-NEXT: subq $40, %rsp
; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %r14d
; AVX512F-NEXT: orl %ebx, %r14d
@@ -4387,6 +4414,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
@@ -4409,11 +4437,13 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x
; AVX512VL-NEXT: subq $40, %rsp
; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %r14d
; AVX512VL-NEXT: orl %ebx, %r14d
@@ -4421,6 +4451,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
@@ -4524,11 +4555,13 @@ define <8 x i16> @cvt_4f64_to_8i16_undef
; AVX512F-NEXT: subq $40, %rsp
; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %r14d
; AVX512F-NEXT: orl %ebx, %r14d
@@ -4536,6 +4569,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
@@ -4559,11 +4593,13 @@ define <8 x i16> @cvt_4f64_to_8i16_undef
; AVX512VL-NEXT: subq $40, %rsp
; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %r14d
; AVX512VL-NEXT: orl %ebx, %r14d
@@ -4571,6 +4607,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
@@ -4677,11 +4714,13 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(
; AVX512F-NEXT: subq $40, %rsp
; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %r14d
; AVX512F-NEXT: orl %ebx, %r14d
@@ -4689,6 +4728,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
@@ -4712,11 +4752,13 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(
; AVX512VL-NEXT: subq $40, %rsp
; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %r14d
; AVX512VL-NEXT: orl %ebx, %r14d
@@ -4724,6 +4766,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
@@ -4891,11 +4934,13 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x
; AVX512F-NEXT: subq $96, %rsp
; AVX512F-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %r15d
; AVX512F-NEXT: orl %ebx, %r15d
@@ -4903,6 +4948,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
@@ -4916,11 +4962,13 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x
; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %r15d
; AVX512F-NEXT: orl %ebx, %r15d
@@ -4928,6 +4976,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bx
; AVX512F-NEXT: shll $16, %ebx
@@ -4954,11 +5003,13 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x
; AVX512VL-NEXT: subq $96, %rsp
; AVX512VL-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %r15d
; AVX512VL-NEXT: orl %ebx, %r15d
@@ -4966,6 +5017,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
@@ -4979,11 +5031,13 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x
; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %r15d
; AVX512VL-NEXT: orl %ebx, %r15d
@@ -4991,6 +5045,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bx
; AVX512VL-NEXT: shll $16, %ebx
@@ -5144,16 +5199,19 @@ define void @store_cvt_4f64_to_4i16(<4 x
; AVX512F-NEXT: movq %rdi, %rbx
; AVX512F-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movl %eax, %r14d
; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movl %eax, %r15d
; AVX512F-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movl %eax, %ebp
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
@@ -5179,16 +5237,19 @@ define void @store_cvt_4f64_to_4i16(<4 x
; AVX512VL-NEXT: movq %rdi, %rbx
; AVX512VL-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %r14d
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %r15d
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %ebp
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
@@ -5303,11 +5364,13 @@ define void @store_cvt_4f64_to_8i16_unde
; AVX512F-NEXT: movq %rdi, %r14
; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bp
; AVX512F-NEXT: shll $16, %ebp
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %ebx
; AVX512F-NEXT: orl %ebp, %ebx
@@ -5315,6 +5378,7 @@ define void @store_cvt_4f64_to_8i16_unde
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bp
; AVX512F-NEXT: shll $16, %ebp
@@ -5342,11 +5406,13 @@ define void @store_cvt_4f64_to_8i16_unde
; AVX512VL-NEXT: movq %rdi, %r14
; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bp
; AVX512VL-NEXT: shll $16, %ebp
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %ebx
; AVX512VL-NEXT: orl %ebp, %ebx
@@ -5354,6 +5420,7 @@ define void @store_cvt_4f64_to_8i16_unde
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bp
; AVX512VL-NEXT: shll $16, %ebp
@@ -5473,11 +5540,13 @@ define void @store_cvt_4f64_to_8i16_zero
; AVX512F-NEXT: movq %rdi, %r14
; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bp
; AVX512F-NEXT: shll $16, %ebp
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %ebx
; AVX512F-NEXT: orl %ebp, %ebx
@@ -5485,6 +5554,7 @@ define void @store_cvt_4f64_to_8i16_zero
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, %bp
; AVX512F-NEXT: shll $16, %ebp
@@ -5512,11 +5582,13 @@ define void @store_cvt_4f64_to_8i16_zero
; AVX512VL-NEXT: movq %rdi, %r14
; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bp
; AVX512VL-NEXT: shll $16, %ebp
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %ebx
; AVX512VL-NEXT: orl %ebp, %ebx
@@ -5524,6 +5596,7 @@ define void @store_cvt_4f64_to_8i16_zero
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, %bp
; AVX512VL-NEXT: shll $16, %ebp
@@ -5700,28 +5773,33 @@ define void @store_cvt_8f64_to_8i16(<8 x
; AVX512F-NEXT: movq %rdi, %rbx
; AVX512F-NEXT: vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movl %eax, %r12d
; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movl %eax, %r13d
; AVX512F-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movl %eax, %ebp
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
@@ -5729,6 +5807,7 @@ define void @store_cvt_8f64_to_8i16(<8 x
; AVX512F-NEXT: movl %eax, %r14d
; AVX512F-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movl %eax, %r15d
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
@@ -5764,28 +5843,33 @@ define void @store_cvt_8f64_to_8i16(<8 x
; AVX512VL-NEXT: movq %rdi, %rbx
; AVX512VL-NEXT: vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX512VL-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %r12d
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %r13d
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %ebp
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
@@ -5793,6 +5877,7 @@ define void @store_cvt_8f64_to_8i16(<8 x
; AVX512VL-NEXT: movl %eax, %r14d
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %r15d
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
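The vector-half-conversions.ll hunks above share one pattern: once the upper bits of a ymm or zmm register may be dirty, a vzeroupper is now inserted not only before returns but also before calls into SSE-only helpers such as __truncdfhf2, so the callee does not pay the AVX/SSE transition penalty. A reduced test in the same spirit (hypothetical, not taken from this commit; the RUN line, CPU choice, and function name are illustrative assumptions) might look like:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s

declare i16 @__truncdfhf2(double)

define i32 @trunc_low_lane(<4 x double> %v) {
; CHECK-LABEL: trunc_low_lane:
; CHECK: vzeroupper
; CHECK-NEXT: callq __truncdfhf2
  %d = extractelement <4 x double> %v, i32 0  ; only xmm data is live past here,
  %h = call i16 @__truncdfhf2(double %d)      ; but %v arrived in ymm0, so the upper half is dirty
  %z = zext i16 %h to i32                     ; use the result so the call is not tail-called
  ret i32 %z
}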
Modified: llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-rotate-128.ll Fri Mar 3 03:03:24 2017
@@ -461,6 +461,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v8i16:
@@ -702,6 +703,7 @@ define <16 x i8> @var_rotate_v16i8(<16 x
; AVX512BW-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v16i8:
@@ -716,6 +718,7 @@ define <16 x i8> @var_rotate_v16i8(<16 x
; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; XOP-LABEL: var_rotate_v16i8:
@@ -1085,6 +1088,7 @@ define <8 x i16> @constant_rotate_v8i16(
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9]
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v8i16:
@@ -1272,6 +1276,7 @@ define <16 x i8> @constant_rotate_v16i8(
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; XOP-LABEL: constant_rotate_v16i8:
Modified: llvm/trunk/test/CodeGen/X86/vector-sext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-sext.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-sext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-sext.ll Fri Mar 3 03:03:24 2017
@@ -1246,6 +1246,7 @@ define <2 x i64> @load_sext_2i1_to_2i64(
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_sext_2i1_to_2i64:
@@ -1254,6 +1255,7 @@ define <2 x i64> @load_sext_2i1_to_2i64(
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_2i1_to_2i64:
@@ -1436,6 +1438,7 @@ define <4 x i32> @load_sext_4i1_to_4i32(
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_sext_4i1_to_4i32:
@@ -1445,6 +1448,7 @@ define <4 x i32> @load_sext_4i1_to_4i32(
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_4i1_to_4i32:
@@ -1941,6 +1945,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_sext_8i1_to_8i16:
@@ -1949,6 +1954,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_8i1_to_8i16:
@@ -2852,6 +2858,7 @@ define <16 x i8> @load_sext_16i1_to_16i8
; AVX512-NEXT: kmovw (%rdi), %k1
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_16i1_to_16i8:
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-masked.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-masked.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-masked.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-masked.ll Fri Mar 3 03:03:24 2017
@@ -243,6 +243,7 @@ define <4 x i32> @mask_extract_v16i32_v4
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti32x4 $0, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -257,6 +258,7 @@ define <4 x i32> @mask_extract_v16i32_v4
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -271,6 +273,7 @@ define <4 x i32> @mask_extract_v16i32_v4
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -285,6 +288,7 @@ define <4 x i32> @mask_extract_v16i32_v4
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -299,6 +303,7 @@ define <4 x float> @mask_extract_v16f32_
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf32x4 $0, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -313,6 +318,7 @@ define <4 x float> @mask_extract_v16f32_
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -327,6 +333,7 @@ define <4 x float> @mask_extract_v16f32_
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -341,6 +348,7 @@ define <4 x float> @mask_extract_v16f32_
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -407,6 +415,7 @@ define <2 x i64> @mask_extract_v8i64_v2i
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti64x2 $0, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -421,6 +430,7 @@ define <2 x i64> @mask_extract_v8i64_v2i
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -435,6 +445,7 @@ define <2 x i64> @mask_extract_v8i64_v2i
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti64x2 $2, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -449,6 +460,7 @@ define <2 x i64> @mask_extract_v8i64_v2i
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti64x2 $3, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -463,6 +475,7 @@ define <2 x double> @mask_extract_v8f64_
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf64x2 $0, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -477,6 +490,7 @@ define <2 x double> @mask_extract_v8f64_
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 2, i32 3>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -491,6 +505,7 @@ define <2 x double> @mask_extract_v8f64_
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf64x2 $2, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 4, i32 5>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -505,6 +520,7 @@ define <2 x double> @mask_extract_v8f64_
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf64x2 $3, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 6, i32 7>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -603,6 +619,7 @@ define <4 x i32> @mask_cast_extract_v8i6
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
%shuffle.cast = bitcast <2 x i64> %shuffle to <4 x i32>
@@ -618,6 +635,7 @@ define <4 x float> @mask_cast_extract_v8
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 2, i32 3>
%shuffle.cast = bitcast <2 x double> %shuffle to <4 x float>
@@ -663,6 +681,7 @@ define <2 x i64> @mask_cast_extract_v16i
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.cast = bitcast <4 x i32> %shuffle to <2 x i64>
@@ -678,6 +697,7 @@ define <2 x double> @mask_cast_extract_v
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.cast = bitcast <4 x float> %shuffle to <2 x double>
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll Fri Mar 3 03:03:24 2017
@@ -78,6 +78,7 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
@@ -88,6 +89,7 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0
; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%a2 = icmp eq <8 x i64> %a, %a1
%b2 = icmp eq <8 x i64> %b, %b1
@@ -108,6 +110,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
@@ -120,6 +123,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7
; VL_BW_DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpmovd2m %zmm2, %k0
; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%a2 = icmp eq <16 x i32> %a, %a1
%b2 = icmp eq <16 x i32> %b, %b1
@@ -162,6 +166,7 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
@@ -172,6 +177,7 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u
; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
@@ -189,6 +195,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
@@ -200,6 +207,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %
; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
@@ -216,6 +224,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
@@ -225,6 +234,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a
; VL_BW_DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -243,6 +253,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
@@ -254,6 +265,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a
; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
@@ -272,6 +284,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
@@ -283,6 +296,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
@@ -303,6 +317,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
@@ -316,6 +331,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8
; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
@@ -336,6 +352,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
@@ -348,6 +365,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
%c1 = bitcast <8 x i1>%c to i8
@@ -364,6 +382,7 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
@@ -373,6 +392,7 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0
; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovw %k0, %eax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i16 %a to <16 x i1>
%c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
@@ -413,6 +433,7 @@ define i64 @shuf64i1_zero(i64 %a) {
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf64i1_zero:
@@ -422,6 +443,7 @@ define i64 @shuf64i1_zero(i64 %a) {
; VL_BW_DQ-NEXT: vpbroadcastb %xmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovb2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovq %k0, %rax
+; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i64 %a to <64 x i1>
%c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
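A second sketch, again hypothetical rather than taken from this commit (the RUN lines, -mcpu values, and check prefixes are assumptions), shows the CPU split at a return boundary: with -mcpu=skylake-avx512 the dirty zmm state is cleared before retq, while with -mcpu=knl no vzeroupper is emitted:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefix=SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=KNL

define <8 x i16> @trunc_v8i64_v8i16(<8 x i64> %a) {
; SKX: vpmovqw %zmm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; KNL: vpmovqw %zmm0, %xmm0
; KNL-NOT: vzeroupper
; KNL: retq
  %t = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %t
}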
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll Fri Mar 3 03:03:24 2017
@@ -42,6 +42,7 @@ define <4 x i32> @trunc_add_v4i64_v4i32(
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <4 x i64> %a0, %a1
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -109,6 +110,7 @@ define <8 x i16> @trunc_add_v8i64_v8i16(
; AVX512: # BB#0:
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <8 x i64> %a0, %a1
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -154,6 +156,7 @@ define <8 x i16> @trunc_add_v8i32_v8i16(
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <8 x i32> %a0, %a1
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -262,6 +265,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_v16i64_v16i8:
@@ -272,6 +276,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_v16i64_v16i8:
@@ -282,6 +287,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = add <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -346,6 +352,7 @@ define <16 x i8> @trunc_add_v16i32_v16i8
; AVX512: # BB#0:
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -392,6 +399,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8
; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
@@ -399,6 +407,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8
; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
@@ -406,6 +415,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8
; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = add <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -452,6 +462,7 @@ define <8 x i16> @trunc_add_v8i32_v8i16_
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = sext <8 x i8> %1 to <8 x i32>
@@ -492,6 +503,7 @@ define <4 x i32> @trunc_add_const_v4i64_
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -549,6 +561,7 @@ define <8 x i16> @trunc_add_const_v8i64_
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -590,6 +603,7 @@ define <8 x i16> @trunc_add_const_v8i32_
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -676,6 +690,7 @@ define <16 x i8> @trunc_add_const_v16i64
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8:
@@ -685,6 +700,7 @@ define <16 x i8> @trunc_add_const_v16i64
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8:
@@ -694,6 +710,7 @@ define <16 x i8> @trunc_add_const_v16i64
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -749,6 +766,7 @@ define <16 x i8> @trunc_add_const_v16i32
; AVX512: # BB#0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -792,6 +810,7 @@ define <16 x i8> @trunc_add_const_v16i16
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
@@ -799,6 +818,7 @@ define <16 x i8> @trunc_add_const_v16i16
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
@@ -806,6 +826,7 @@ define <16 x i8> @trunc_add_const_v16i16
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -848,6 +869,7 @@ define <4 x i32> @trunc_sub_v4i64_v4i32(
; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <4 x i64> %a0, %a1
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -915,6 +937,7 @@ define <8 x i16> @trunc_sub_v8i64_v8i16(
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <8 x i64> %a0, %a1
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -960,6 +983,7 @@ define <8 x i16> @trunc_sub_v8i32_v8i16(
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <8 x i32> %a0, %a1
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -1068,6 +1092,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_v16i64_v16i8:
@@ -1078,6 +1103,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_v16i64_v16i8:
@@ -1088,6 +1114,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = sub <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -1152,6 +1179,7 @@ define <16 x i8> @trunc_sub_v16i32_v16i8
; AVX512: # BB#0:
; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -1198,6 +1226,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8
; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
@@ -1205,6 +1234,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8
; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
@@ -1212,6 +1242,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8
; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = sub <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -1259,6 +1290,7 @@ define <4 x i32> @trunc_sub_const_v4i64_
; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -1330,6 +1362,7 @@ define <8 x i16> @trunc_sub_const_v8i64_
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -1374,6 +1407,7 @@ define <8 x i16> @trunc_sub_const_v8i32_
; AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -1484,6 +1518,7 @@ define <16 x i8> @trunc_sub_const_v16i64
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_const_v16i64_v16i8:
@@ -1494,6 +1529,7 @@ define <16 x i8> @trunc_sub_const_v16i64
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_const_v16i64_v16i8:
@@ -1504,6 +1540,7 @@ define <16 x i8> @trunc_sub_const_v16i64
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -1566,6 +1603,7 @@ define <16 x i8> @trunc_sub_const_v16i32
; AVX512: # BB#0:
; AVX512-NEXT: vpsubd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -1611,6 +1649,7 @@ define <16 x i8> @trunc_sub_const_v16i16
; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
@@ -1618,6 +1657,7 @@ define <16 x i8> @trunc_sub_const_v16i16
; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
@@ -1625,6 +1665,7 @@ define <16 x i8> @trunc_sub_const_v16i16
; AVX512DQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -1688,6 +1729,7 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
@@ -1697,6 +1739,7 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
@@ -1706,6 +1749,7 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <4 x i64> %a0, %a1
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -1792,6 +1836,7 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(
; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
@@ -1799,12 +1844,14 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(
; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <8 x i64> %a0, %a1
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -1862,6 +1909,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16(
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <8 x i32> %a0, %a1
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -2116,6 +2164,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8
; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
@@ -2128,6 +2177,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8
; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
@@ -2138,6 +2188,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -2226,6 +2277,7 @@ define <16 x i8> @trunc_mul_v16i32_v16i8
; AVX512: # BB#0:
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -2272,6 +2324,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
@@ -2279,6 +2332,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
@@ -2286,6 +2340,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -2332,6 +2387,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16_
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = zext <8 x i8> %1 to <8 x i32>
@@ -2387,6 +2443,7 @@ define <4 x i32> @trunc_mul_const_v4i64_
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -2444,6 +2501,7 @@ define <8 x i16> @trunc_mul_const_v8i64_
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -2485,6 +2543,7 @@ define <8 x i16> @trunc_mul_const_v8i32_
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -2681,6 +2740,7 @@ define <16 x i8> @trunc_mul_const_v16i64
; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
@@ -2691,6 +2751,7 @@ define <16 x i8> @trunc_mul_const_v16i64
; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
@@ -2701,6 +2762,7 @@ define <16 x i8> @trunc_mul_const_v16i64
; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -2791,6 +2853,7 @@ define <16 x i8> @trunc_mul_const_v16i32
; AVX512: # BB#0:
; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -2836,6 +2899,7 @@ define <16 x i8> @trunc_mul_const_v16i16
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
@@ -2843,6 +2907,7 @@ define <16 x i8> @trunc_mul_const_v16i16
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
@@ -2850,6 +2915,7 @@ define <16 x i8> @trunc_mul_const_v16i16
; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -2890,6 +2956,7 @@ define <4 x i32> @trunc_and_v4i64_v4i32(
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <4 x i64> %a0, %a1
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -2953,6 +3020,7 @@ define <8 x i16> @trunc_and_v8i64_v8i16(
; AVX512: # BB#0:
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <8 x i64> %a0, %a1
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -2996,6 +3064,7 @@ define <8 x i16> @trunc_and_v8i32_v8i16(
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <8 x i32> %a0, %a1
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -3096,6 +3165,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_v16i64_v16i8:
@@ -3106,6 +3176,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_v16i64_v16i8:
@@ -3116,6 +3187,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = and <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -3176,6 +3248,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8
; AVX512: # BB#0:
; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -3220,6 +3293,7 @@ define <16 x i8> @trunc_and_v16i16_v16i8
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
@@ -3227,6 +3301,7 @@ define <16 x i8> @trunc_and_v16i16_v16i8
; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
@@ -3234,6 +3309,7 @@ define <16 x i8> @trunc_and_v16i16_v16i8
; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = and <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -3272,6 +3348,7 @@ define <4 x i32> @trunc_and_const_v4i64_
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -3329,6 +3406,7 @@ define <8 x i16> @trunc_and_const_v8i64_
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -3370,6 +3448,7 @@ define <8 x i16> @trunc_and_const_v8i32_
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -3456,6 +3535,7 @@ define <16 x i8> @trunc_and_const_v16i64
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8:
@@ -3465,6 +3545,7 @@ define <16 x i8> @trunc_and_const_v16i64
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8:
@@ -3474,6 +3555,7 @@ define <16 x i8> @trunc_and_const_v16i64
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -3529,6 +3611,7 @@ define <16 x i8> @trunc_and_const_v16i32
; AVX512: # BB#0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -3572,6 +3655,7 @@ define <16 x i8> @trunc_and_const_v16i16
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
@@ -3579,6 +3663,7 @@ define <16 x i8> @trunc_and_const_v16i16
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
@@ -3586,6 +3671,7 @@ define <16 x i8> @trunc_and_const_v16i16
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -3626,6 +3712,7 @@ define <4 x i32> @trunc_xor_v4i64_v4i32(
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <4 x i64> %a0, %a1
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -3689,6 +3776,7 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(
; AVX512: # BB#0:
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <8 x i64> %a0, %a1
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -3732,6 +3820,7 @@ define <8 x i16> @trunc_xor_v8i32_v8i16(
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <8 x i32> %a0, %a1
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -3832,6 +3921,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_v16i64_v16i8:
@@ -3842,6 +3932,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_v16i64_v16i8:
@@ -3852,6 +3943,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = xor <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -3912,6 +4004,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8
; AVX512: # BB#0:
; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -3956,6 +4049,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8
; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
@@ -3963,6 +4057,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8
; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
@@ -3970,6 +4065,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8
; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = xor <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -4008,6 +4104,7 @@ define <4 x i32> @trunc_xor_const_v4i64_
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -4065,6 +4162,7 @@ define <8 x i16> @trunc_xor_const_v8i64_
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -4106,6 +4204,7 @@ define <8 x i16> @trunc_xor_const_v8i32_
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -4192,6 +4291,7 @@ define <16 x i8> @trunc_xor_const_v16i64
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8:
@@ -4201,6 +4301,7 @@ define <16 x i8> @trunc_xor_const_v16i64
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8:
@@ -4210,6 +4311,7 @@ define <16 x i8> @trunc_xor_const_v16i64
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -4265,6 +4367,7 @@ define <16 x i8> @trunc_xor_const_v16i32
; AVX512: # BB#0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -4308,6 +4411,7 @@ define <16 x i8> @trunc_xor_const_v16i16
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
@@ -4315,6 +4419,7 @@ define <16 x i8> @trunc_xor_const_v16i16
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
@@ -4322,6 +4427,7 @@ define <16 x i8> @trunc_xor_const_v16i16
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -4362,6 +4468,7 @@ define <4 x i32> @trunc_or_v4i64_v4i32(<
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <4 x i64> %a0, %a1
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -4425,6 +4532,7 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<
; AVX512: # BB#0:
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <8 x i64> %a0, %a1
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -4468,6 +4576,7 @@ define <8 x i16> @trunc_or_v8i32_v8i16(<
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <8 x i32> %a0, %a1
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -4568,6 +4677,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_v16i64_v16i8:
@@ -4578,6 +4688,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_v16i64_v16i8:
@@ -4588,6 +4699,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = or <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -4648,6 +4760,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(
; AVX512: # BB#0:
; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <16 x i32> %a0, %a1
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -4692,6 +4805,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
@@ -4699,6 +4813,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
@@ -4706,6 +4821,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(
; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = or <16 x i16> %a0, %a1
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -4744,6 +4860,7 @@ define <4 x i32> @trunc_or_const_v4i64_v
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -4801,6 +4918,7 @@ define <8 x i16> @trunc_or_const_v8i64_v
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -4842,6 +4960,7 @@ define <8 x i16> @trunc_or_const_v8i32_v
; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -4928,6 +5047,7 @@ define <16 x i8> @trunc_or_const_v16i64_
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8:
@@ -4937,6 +5057,7 @@ define <16 x i8> @trunc_or_const_v16i64_
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8:
@@ -4946,6 +5067,7 @@ define <16 x i8> @trunc_or_const_v16i64_
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -5001,6 +5123,7 @@ define <16 x i8> @trunc_or_const_v16i32_
; AVX512: # BB#0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -5044,6 +5167,7 @@ define <16 x i8> @trunc_or_const_v16i16_
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
@@ -5051,6 +5175,7 @@ define <16 x i8> @trunc_or_const_v16i16_
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
@@ -5058,6 +5183,7 @@ define <16 x i8> @trunc_or_const_v16i16_
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
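
For reference, the pattern exercised by these vector-trunc-math.ll checks reduces to a standalone test along the lines of the sketch below (function name and RUN line are illustrative, not part of this commit): the 512-bit op plus truncate leaves the upper ZMM state dirty, so a vzeroupper is now expected before the return.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s
define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK: vpandq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vpmovqw %zmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %1 = and <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}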
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc.ll Fri Mar 3 03:03:24 2017
@@ -120,6 +120,7 @@ define <8 x i16> @trunc8i64_8i16(<8 x i6
; AVX512-LABEL: trunc8i64_8i16:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%0 = trunc <8 x i64> %a to <8 x i16>
@@ -175,6 +176,7 @@ define void @trunc8i64_8i8(<8 x i64> %a)
; AVX512-LABEL: trunc8i64_8i8:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovqb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%0 = trunc <8 x i64> %a to <8 x i8>
@@ -231,11 +233,13 @@ define <8 x i16> @trunc8i32_8i16(<8 x i3
; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i16:
; AVX512VL: # BB#0: # %entry
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i16:
@@ -243,11 +247,13 @@ define <8 x i16> @trunc8i32_8i16(<8 x i3
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16:
; AVX512BWVL: # BB#0: # %entry
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <8 x i32> %a to <8 x i16>
@@ -309,11 +315,13 @@ define void @trunc8i32_8i8(<8 x i32> %a)
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i8:
; AVX512VL: # BB#0: # %entry
; AVX512VL-NEXT: vpmovdb %ymm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i8:
@@ -322,11 +330,13 @@ define void @trunc8i32_8i8(<8 x i32> %a)
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i8:
; AVX512BWVL: # BB#0: # %entry
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <8 x i32> %a to <8 x i8>
@@ -411,6 +421,7 @@ define void @trunc16i32_16i16(<16 x i32>
; AVX512-LABEL: trunc16i32_16i16:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovdw %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%0 = trunc <16 x i32> %a to <16 x i16>
@@ -466,6 +477,7 @@ define void @trunc16i32_16i8(<16 x i32>
; AVX512-LABEL: trunc16i32_16i8:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%0 = trunc <16 x i32> %a to <16 x i8>
@@ -529,6 +541,7 @@ define void @trunc16i16_16i8(<16 x i16>
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc16i16_16i8:
@@ -536,6 +549,7 @@ define void @trunc16i16_16i8(<16 x i16>
; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc16i16_16i8:
@@ -543,11 +557,13 @@ define void @trunc16i16_16i8(<16 x i16>
; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8:
; AVX512BWVL: # BB#0: # %entry
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <16 x i16> %a to <16 x i8>
@@ -635,6 +651,7 @@ define void @trunc32i16_32i8(<32 x i16>
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc32i16_32i8:
@@ -645,16 +662,19 @@ define void @trunc32i16_32i8(<32 x i16>
; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc32i16_32i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovwb %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc32i16_32i8:
; AVX512BWVL: # BB#0: # %entry
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <32 x i16> %a to <32 x i8>
@@ -810,6 +830,7 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i64_8i16:
@@ -823,6 +844,7 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i64_8i16:
@@ -835,6 +857,7 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i16:
@@ -848,6 +871,7 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x
; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <4 x i64> %a to <4 x i16>
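
The same applies when nothing is returned in a vector register: in the trunc16i32_16i16-style checks above, the truncated value is stored straight to memory, yet the upper ZMM state is still dirty at the ret. A reduced sketch (function name, pointer argument, and RUN line are illustrative):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s
define void @trunc_store_v16i32_v16i16(<16 x i32> %a, <16 x i16>* %p) {
; CHECK: vpmovdw %zmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
  %t = trunc <16 x i32> %a to <16 x i16>
  store <16 x i16> %t, <16 x i16>* %p
  ret void
}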
Modified: llvm/trunk/test/CodeGen/X86/vector-tzcnt-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-tzcnt-128.ll?rev=296859&r1=296858&r2=296859&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-tzcnt-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-tzcnt-128.ll Fri Mar 3 03:03:24 2017
@@ -281,6 +281,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63]
; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv2i64u:
@@ -696,6 +697,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv4i32u:
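
These vector-tzcnt-128.ll changes show another way the upper state gets dirtied: on AVX512CD targets without VLX, even a 128-bit cttz is widened to a 512-bit vplzcntq, so the pass has to clean up before returning. A reduced sketch (function name and RUN line are illustrative):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd | FileCheck %s
declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
define <2 x i64> @testv2i64u(<2 x i64> %in) {
; CHECK: vplzcntq %zmm0, %zmm0
; CHECK: vzeroupper
; CHECK-NEXT: retq
  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 true)
  ret <2 x i64> %out
}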