[clang] b2b6a54 - [X86] Add support for -mvzeroupper and -mno-vzeroupper to match gcc
Craig Topper via cfe-commits
cfe-commits at lists.llvm.org
Mon Nov 4 11:04:14 PST 2019
Author: Craig Topper
Date: 2019-11-04T11:03:54-08:00
New Revision: b2b6a54f847f33f821f41e3e82bf3b86e08817a0
URL: https://github.com/llvm/llvm-project/commit/b2b6a54f847f33f821f41e3e82bf3b86e08817a0
DIFF: https://github.com/llvm/llvm-project/commit/b2b6a54f847f33f821f41e3e82bf3b86e08817a0.diff
LOG: [X86] Add support for -mvzeroupper and -mno-vzeroupper to match gcc
-mvzeroupper will force the vzeroupper insertion pass to run on
CPUs that normally wouldn't. -mno-vzeroupper disables it on CPUs
where it normally runs.
To support this with the default feature handling in clang, we
need a vzeroupper feature flag in X86.td. Since this flag has
the opposite polarity of the fast-partial-ymm-or-zmm-write we
used to use to disable the pass, we now need to add this new
flag to every CPU except KNL/KNM and BTVER2 to keep identical
behavior.
Remove -fast-partial-ymm-or-zmm-write which is no longer used.
Differential Revision: https://reviews.llvm.org/D69786
Added:
Modified:
clang/docs/ReleaseNotes.rst
clang/include/clang/Driver/Options.td
clang/test/Driver/x86-target-features.c
llvm/docs/ReleaseNotes.rst
llvm/lib/Target/X86/X86.td
llvm/lib/Target/X86/X86Subtarget.h
llvm/lib/Target/X86/X86TargetTransformInfo.h
llvm/lib/Target/X86/X86VZeroUpper.cpp
llvm/test/CodeGen/X86/avx-vzeroupper.ll
Removed:
################################################################################
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 298d793a0de4..5363050ca69d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -93,6 +93,10 @@ New Compiler Flags
Clang. Setting the version to zero causes Clang to leave ``__GNUC__`` and
other GNU-namespaced macros, such as ``__GXX_WEAK__``, undefined.
+- vzeroupper insertion on X86 targets can now be disabled with -mno-vzeroupper.
+ You can also force vzeroupper insertion to be used on CPUs that normally
+ wouldn't with -mvzeroupper.
+
Deprecated Compiler Flags
-------------------------
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 47e8bbf3eb12..d75a0f601d7e 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3135,6 +3135,8 @@ def mshstk : Flag<["-"], "mshstk">, Group<m_x86_Features_Group>;
def mno_shstk : Flag<["-"], "mno-shstk">, Group<m_x86_Features_Group>;
def mretpoline_external_thunk : Flag<["-"], "mretpoline-external-thunk">, Group<m_x86_Features_Group>;
def mno_retpoline_external_thunk : Flag<["-"], "mno-retpoline-external-thunk">, Group<m_x86_Features_Group>;
+def mvzeroupper : Flag<["-"], "mvzeroupper">, Group<m_x86_Features_Group>;
+def mno_vzeroupper : Flag<["-"], "mno-vzeroupper">, Group<m_x86_Features_Group>;
// These are legacy user-facing driver-level option spellings. They are always
// aliases for options that are spelled using the more common Unix / GNU flag
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index 29d8f59ab872..9a406b504b24 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -193,3 +193,8 @@
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-enqcmd %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-ENQCMD %s
// ENQCMD: "-target-feature" "+enqcmd"
// NO-ENQCMD: "-target-feature" "-enqcmd"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mvzeroupper %s -### -o %t.o 2>&1 | FileCheck --check-prefix=VZEROUPPER %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-vzeroupper %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-VZEROUPPER %s
+// VZEROUPPER: "-target-feature" "+vzeroupper"
+// NO-VZEROUPPER: "-target-feature" "-vzeroupper"
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 74fe35c89874..1510ca5a5e2d 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -134,6 +134,13 @@ Changes to the X86 Target
Intel CPUs. This tries to limit the use of 512-bit registers which can cause a
decrease in CPU frequency on these CPUs. This can be re-enabled by passing
-mprefer-vector-width=512 to clang or passing -mattr=-prefer-256-bit to llc.
+* Deprecated the mpx feature flag for the Intel MPX instructions. There were no
intrinsics for this feature. This change only affects the results
+ returned by getHostCPUFeatures on CPUs that implement the MPX instructions.
+* The feature flag fast-partial-ymm-or-zmm-write which previously disabled
+ vzeroupper insertion has been removed. It has been replaced with a vzeroupper
+ feature flag which has the opposite polarity. So -vzeroupper has the same
+ effect as +fast-partial-ymm-or-zmm-write.
Changes to the AMDGPU Target
-----------------------------
@@ -143,10 +150,6 @@ Changes to the AVR Target
During this release ...
-* Deprecated the mpx feature flag for the Intel MPX instructions. There were no
- intrinsics for this feature. This change only this effects the results
- returned by getHostCPUFeatures on CPUs that implement the MPX instructions.
-
Changes to the WebAssembly Target
---------------------------------
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 44be46ef41c4..6d6127dcb376 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -304,12 +304,12 @@ def FeatureFastVariableShuffle
: SubtargetFeature<"fast-variable-shuffle",
"HasFastVariableShuffle",
"true", "Shuffles with variable masks are fast">;
-// On some X86 processors, there is no performance hazard to writing only the
-// lower parts of a YMM or ZMM register without clearing the upper part.
-def FeatureFastPartialYMMorZMMWrite
- : SubtargetFeature<"fast-partial-ymm-or-zmm-write",
- "HasFastPartialYMMorZMMWrite",
- "true", "Partial writes to YMM/ZMM registers are fast">;
+// On some X86 processors, a vzeroupper instruction should be inserted after
+// using ymm/zmm registers before executing code that may use SSE instructions.
+def FeatureInsertVZEROUPPER
+ : SubtargetFeature<"vzeroupper",
+ "InsertVZEROUPPER",
+ "true", "Should insert vzeroupper instructions">;
// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
// vector FSQRT has higher throughput than the corresponding NR code.
@@ -525,7 +525,8 @@ def ProcessorFeatures {
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureLAHFSAHF,
- FeatureMacroFusion];
+ FeatureMacroFusion,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> NHMSpecificFeatures = [];
list<SubtargetFeature> NHMFeatures =
!listconcat(NHMInheritableFeatures, NHMSpecificFeatures);
@@ -705,7 +706,8 @@ def ProcessorFeatures {
FeatureCMPXCHG16B,
FeatureMOVBE,
FeatureSlowTwoMemOps,
- FeatureLAHFSAHF];
+ FeatureLAHFSAHF,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> AtomSpecificFeatures = [ProcIntelAtom,
FeatureSlowUAMem16,
FeatureLEAForSP,
@@ -807,7 +809,6 @@ def ProcessorFeatures {
FeaturePRFCHW,
FeaturePreferMaskRegisters,
FeatureSlowTwoMemOps,
- FeatureFastPartialYMMorZMMWrite,
FeatureHasFastGather,
FeatureSlowPMADDWD];
// TODO Add AVX5124FMAPS/AVX5124VNNIW features
@@ -828,7 +829,8 @@ def ProcessorFeatures {
FeatureLAHFSAHF,
FeatureCMOV,
Feature64Bit,
- FeatureFastScalarShiftMasks];
+ FeatureFastScalarShiftMasks,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> BarcelonaFeatures = BarcelonaInheritableFeatures;
// Bobcat
@@ -850,7 +852,9 @@ def ProcessorFeatures {
FeatureFast15ByteNOP,
FeatureFastScalarShiftMasks,
FeatureFastVectorShiftMasks];
- list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures;
+ list<SubtargetFeature> BtVer1SpecificFeatures = [FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> BtVer1Features =
+ !listconcat(BtVer1InheritableFeatures, BtVer1SpecificFeatures);
// Jaguar
list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,
@@ -863,7 +867,6 @@ def ProcessorFeatures {
FeatureXSAVEOPT];
list<SubtargetFeature> BtVer2SpecificFeatures = [FeatureFastLZCNT,
FeatureFastBEXTR,
- FeatureFastPartialYMMorZMMWrite,
FeatureFastHorizontalOps];
list<SubtargetFeature> BtVer2InheritableFeatures =
!listconcat(BtVer1InheritableFeatures, BtVer2AdditionalFeatures);
@@ -891,7 +894,8 @@ def ProcessorFeatures {
FeatureLAHFSAHF,
FeatureFast11ByteNOP,
FeatureFastScalarShiftMasks,
- FeatureBranchFusion];
+ FeatureBranchFusion,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures;
// PileDriver
@@ -954,6 +958,7 @@ def ProcessorFeatures {
FeatureSHA,
FeatureSSE4A,
FeatureSlowSHLD,
+ FeatureInsertVZEROUPPER,
FeatureX87,
FeatureXSAVE,
FeatureXSAVEC,
@@ -976,28 +981,32 @@ class Proc<string Name, list<SubtargetFeature> Features>
// NOTE: CMPXCHG8B is here for legacy compatbility so that it is only disabled
// if i386/i486 is specifically requested.
def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16,
- FeatureCMPXCHG8B]>;
-def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>;
-def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>;
+ FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
+def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16,
+ FeatureInsertVZEROUPPER]>;
+def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16,
+ FeatureInsertVZEROUPPER]>;
def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16,
- FeatureCMPXCHG8B]>;
+ FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16,
- FeatureCMPXCHG8B]>;
+ FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16,
- FeatureCMPXCHG8B, FeatureMMX]>;
+ FeatureCMPXCHG8B, FeatureMMX,
+ FeatureInsertVZEROUPPER]>;
def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureCMOV]>;
+ FeatureCMOV, FeatureInsertVZEROUPPER]>;
def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureCMOV, FeatureNOPL]>;
+ FeatureCMOV, FeatureNOPL, FeatureInsertVZEROUPPER]>;
def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureCMOV, FeatureFXSR,
- FeatureNOPL]>;
+ FeatureNOPL, FeatureInsertVZEROUPPER]>;
foreach P = ["pentium3", "pentium3m"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,FeatureMMX,
- FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
+ FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV,
+ FeatureInsertVZEROUPPER]>;
}
// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
@@ -1013,29 +1022,29 @@ foreach P = ["pentium3", "pentium3m"] in {
def : ProcessorModel<"pentium-m", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
- FeatureCMOV]>;
+ FeatureCMOV, FeatureInsertVZEROUPPER]>;
foreach P = ["pentium4", "pentium4m"] in {
def : ProcessorModel<P, GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
- FeatureCMOV]>;
+ FeatureCMOV, FeatureInsertVZEROUPPER]>;
}
// Intel Quark.
-def : Proc<"lakemont", []>;
+def : Proc<"lakemont", [FeatureInsertVZEROUPPER]>;
// Intel Core Duo.
def : ProcessorModel<"yonah", SandyBridgeModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
- FeatureCMOV]>;
+ FeatureCMOV, FeatureInsertVZEROUPPER]>;
// NetBurst.
def : ProcessorModel<"prescott", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
- FeatureCMOV]>;
+ FeatureCMOV, FeatureInsertVZEROUPPER]>;
def : ProcessorModel<"nocona", GenericPostRAModel, [
FeatureX87,
FeatureSlowUAMem16,
@@ -1046,7 +1055,8 @@ def : ProcessorModel<"nocona", GenericPostRAModel, [
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
- FeatureCMPXCHG16B
+ FeatureCMPXCHG16B,
+ FeatureInsertVZEROUPPER
]>;
// Intel Core 2 Solo/Duo.
@@ -1062,7 +1072,8 @@ def : ProcessorModel<"core2", SandyBridgeModel, [
Feature64Bit,
FeatureCMPXCHG16B,
FeatureLAHFSAHF,
- FeatureMacroFusion
+ FeatureMacroFusion,
+ FeatureInsertVZEROUPPER
]>;
def : ProcessorModel<"penryn", SandyBridgeModel, [
FeatureX87,
@@ -1076,7 +1087,8 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [
Feature64Bit,
FeatureCMPXCHG16B,
FeatureLAHFSAHF,
- FeatureMacroFusion
+ FeatureMacroFusion,
+ FeatureInsertVZEROUPPER
]>;
// Atom CPUs.
@@ -1143,35 +1155,36 @@ def : ProcessorModel<"tigerlake", SkylakeServerModel,
// AMD CPUs.
def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureMMX]>;
+ FeatureMMX, FeatureInsertVZEROUPPER]>;
def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- Feature3DNow]>;
+ Feature3DNow, FeatureInsertVZEROUPPER]>;
def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- Feature3DNow]>;
+ Feature3DNow, FeatureInsertVZEROUPPER]>;
foreach P = ["athlon", "athlon-tbird"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
- Feature3DNowA, FeatureNOPL, FeatureSlowSHLD]>;
+ Feature3DNowA, FeatureNOPL, FeatureSlowSHLD,
+ FeatureInsertVZEROUPPER]>;
}
foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL,
- FeatureSlowSHLD]>;
+ FeatureSlowSHLD, FeatureInsertVZEROUPPER]>;
}
foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureSSE2, Feature3DNowA, FeatureFXSR, FeatureNOPL,
Feature64Bit, FeatureSlowSHLD, FeatureCMOV,
- FeatureFastScalarShiftMasks]>;
+ FeatureFastScalarShiftMasks, FeatureInsertVZEROUPPER]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureSSE3,
Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B,
FeatureSlowSHLD, FeatureCMOV, Feature64Bit,
- FeatureFastScalarShiftMasks]>;
+ FeatureFastScalarShiftMasks, FeatureInsertVZEROUPPER]>;
}
foreach P = ["amdfam10", "barcelona"] in {
@@ -1196,14 +1209,17 @@ def : ProcessorModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures>;
def : ProcessorModel<"znver2", Znver1Model, ProcessorFeatures.ZN2Features>;
def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- Feature3DNowA]>;
-
-def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
-def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
-def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+ Feature3DNowA, FeatureInsertVZEROUPPER]>;
+
+def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureInsertVZEROUPPER]>;
+def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow,
+ FeatureInsertVZEROUPPER]>;
+def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow,
+ FeatureInsertVZEROUPPER]>;
def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE1, FeatureFXSR,
- FeatureCMOV]>;
+ FeatureCMOV, FeatureInsertVZEROUPPER]>;
// We also provide a generic 64-bit specific x86 processor model which tries to
// be good for modern chips without enabling instruction set encodings past the
@@ -1226,7 +1242,8 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [
Feature64Bit,
FeatureSlow3OpsLEA,
FeatureSlowIncDec,
- FeatureMacroFusion
+ FeatureMacroFusion,
+ FeatureInsertVZEROUPPER
]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 14f831045b38..c4770d7cf580 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -256,9 +256,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// mask over multiple fixed shuffles.
bool HasFastVariableShuffle = false;
- /// True if there is no performance penalty to writing only the lower parts
- /// of a YMM or ZMM register without clearing the upper part.
- bool HasFastPartialYMMorZMMWrite = false;
+ /// True if vzeroupper instructions should be inserted after code that uses
+ /// ymm or zmm registers.
+ bool InsertVZEROUPPER = false;
/// True if there is no performance penalty for writing NOPs with up to
/// 11 bytes.
@@ -658,9 +658,7 @@ class X86Subtarget final : public X86GenSubtargetInfo {
bool hasFastVariableShuffle() const {
return HasFastVariableShuffle;
}
- bool hasFastPartialYMMorZMMWrite() const {
- return HasFastPartialYMMorZMMWrite;
- }
+ bool insertVZEROUPPER() const { return InsertVZEROUPPER; }
bool hasFastGather() const { return HasFastGather; }
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index b06512921fbb..1680239eda4e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -51,7 +51,6 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::FeatureFastBEXTR,
X86::FeatureFastHorizontalOps,
X86::FeatureFastLZCNT,
- X86::FeatureFastPartialYMMorZMMWrite,
X86::FeatureFastScalarFSQRT,
X86::FeatureFastSHLDRotate,
X86::FeatureFastScalarShiftMasks,
@@ -78,6 +77,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::FeatureSlowTwoMemOps,
X86::FeatureSlowUAMem16,
X86::FeaturePreferMaskRegisters,
+ X86::FeatureInsertVZEROUPPER,
// Perf-tuning flags.
X86::FeatureHasFastGather,
diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp
index 9280d030b5d5..7a8308ef1ba9 100644
--- a/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -279,7 +279,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
/// function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite())
+ if (!ST.hasAVX() || !ST.insertVZEROUPPER())
return false;
TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
diff --git a/llvm/test/CodeGen/X86/avx-vzeroupper.ll b/llvm/test/CodeGen/X86/avx-vzeroupper.ll
index 1abeb76de0f4..4ae9b3144a34 100644
--- a/llvm/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/llvm/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-ymm-zmm
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,-vzeroupper | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=DISABLE-VZ
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BDVER2
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2
@@ -44,18 +44,18 @@ define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounw
; VZ-NEXT: addq $56, %rsp
; VZ-NEXT: retq
;
-; FAST-ymm-zmm-LABEL: test01:
-; FAST-ymm-zmm: # %bb.0:
-; FAST-ymm-zmm-NEXT: subq $56, %rsp
-; FAST-ymm-zmm-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
-; FAST-ymm-zmm-NEXT: vmovaps {{.*}}(%rip), %xmm0
-; FAST-ymm-zmm-NEXT: callq do_sse
-; FAST-ymm-zmm-NEXT: vmovaps %xmm0, {{.*}}(%rip)
-; FAST-ymm-zmm-NEXT: callq do_sse
-; FAST-ymm-zmm-NEXT: vmovaps %xmm0, {{.*}}(%rip)
-; FAST-ymm-zmm-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; FAST-ymm-zmm-NEXT: addq $56, %rsp
-; FAST-ymm-zmm-NEXT: retq
+; DISABLE-VZ-LABEL: test01:
+; DISABLE-VZ: # %bb.0:
+; DISABLE-VZ-NEXT: subq $56, %rsp
+; DISABLE-VZ-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
+; DISABLE-VZ-NEXT: vmovaps {{.*}}(%rip), %xmm0
+; DISABLE-VZ-NEXT: callq do_sse
+; DISABLE-VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip)
+; DISABLE-VZ-NEXT: callq do_sse
+; DISABLE-VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip)
+; DISABLE-VZ-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; DISABLE-VZ-NEXT: addq $56, %rsp
+; DISABLE-VZ-NEXT: retq
;
; BDVER2-LABEL: test01:
; BDVER2: # %bb.0:
@@ -83,6 +83,7 @@ define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounw
; BTVER2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; BTVER2-NEXT: addq $56, %rsp
; BTVER2-NEXT: retq
+; DISABLE-VZ # %bb.0:
%tmp = load <4 x float>, <4 x float>* @x, align 16
%call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
store <4 x float> %call, <4 x float>* @x, align 16
@@ -100,10 +101,10 @@ define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind {
; VZ-NEXT: vzeroupper
; VZ-NEXT: jmp do_sse # TAILCALL
;
-; FAST-ymm-zmm-LABEL: test02:
-; FAST-ymm-zmm: # %bb.0:
-; FAST-ymm-zmm-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; FAST-ymm-zmm-NEXT: jmp do_sse # TAILCALL
+; DISABLE-VZ-LABEL: test02:
+; DISABLE-VZ: # %bb.0:
+; DISABLE-VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; DISABLE-VZ-NEXT: jmp do_sse # TAILCALL
;
; BDVER2-LABEL: test02:
; BDVER2: # %bb.0:
@@ -154,34 +155,34 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-NEXT: popq %rbx
; VZ-NEXT: retq
;
-; FAST-ymm-zmm-LABEL: test03:
-; FAST-ymm-zmm: # %bb.0: # %entry
-; FAST-ymm-zmm-NEXT: pushq %rbx
-; FAST-ymm-zmm-NEXT: subq $16, %rsp
-; FAST-ymm-zmm-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; FAST-ymm-zmm-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; FAST-ymm-zmm-NEXT: .p2align 4, 0x90
-; FAST-ymm-zmm-NEXT: .LBB3_1: # %while.cond
-; FAST-ymm-zmm-NEXT: # =>This Inner Loop Header: Depth=1
-; FAST-ymm-zmm-NEXT: callq foo
-; FAST-ymm-zmm-NEXT: testl %eax, %eax
-; FAST-ymm-zmm-NEXT: jne .LBB3_1
-; FAST-ymm-zmm-NEXT: # %bb.2: # %for.body.preheader
-; FAST-ymm-zmm-NEXT: movl $4, %ebx
-; FAST-ymm-zmm-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; FAST-ymm-zmm-NEXT: .p2align 4, 0x90
-; FAST-ymm-zmm-NEXT: .LBB3_3: # %for.body
-; FAST-ymm-zmm-NEXT: # =>This Inner Loop Header: Depth=1
-; FAST-ymm-zmm-NEXT: callq do_sse
-; FAST-ymm-zmm-NEXT: callq do_sse
-; FAST-ymm-zmm-NEXT: vmovaps g+{{.*}}(%rip), %xmm0
-; FAST-ymm-zmm-NEXT: callq do_sse
-; FAST-ymm-zmm-NEXT: decl %ebx
-; FAST-ymm-zmm-NEXT: jne .LBB3_3
-; FAST-ymm-zmm-NEXT: # %bb.4: # %for.end
-; FAST-ymm-zmm-NEXT: addq $16, %rsp
-; FAST-ymm-zmm-NEXT: popq %rbx
-; FAST-ymm-zmm-NEXT: retq
+; DISABLE-VZ-LABEL: test03:
+; DISABLE-VZ: # %bb.0: # %entry
+; DISABLE-VZ-NEXT: pushq %rbx
+; DISABLE-VZ-NEXT: subq $16, %rsp
+; DISABLE-VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; DISABLE-VZ-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; DISABLE-VZ-NEXT: .p2align 4, 0x90
+; DISABLE-VZ-NEXT: .LBB3_1: # %while.cond
+; DISABLE-VZ-NEXT: # =>This Inner Loop Header: Depth=1
+; DISABLE-VZ-NEXT: callq foo
+; DISABLE-VZ-NEXT: testl %eax, %eax
+; DISABLE-VZ-NEXT: jne .LBB3_1
+; DISABLE-VZ-NEXT: # %bb.2: # %for.body.preheader
+; DISABLE-VZ-NEXT: movl $4, %ebx
+; DISABLE-VZ-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; DISABLE-VZ-NEXT: .p2align 4, 0x90
+; DISABLE-VZ-NEXT: .LBB3_3: # %for.body
+; DISABLE-VZ-NEXT: # =>This Inner Loop Header: Depth=1
+; DISABLE-VZ-NEXT: callq do_sse
+; DISABLE-VZ-NEXT: callq do_sse
+; DISABLE-VZ-NEXT: vmovaps g+{{.*}}(%rip), %xmm0
+; DISABLE-VZ-NEXT: callq do_sse
+; DISABLE-VZ-NEXT: decl %ebx
+; DISABLE-VZ-NEXT: jne .LBB3_3
+; DISABLE-VZ-NEXT: # %bb.4: # %for.end
+; DISABLE-VZ-NEXT: addq $16, %rsp
+; DISABLE-VZ-NEXT: popq %rbx
+; DISABLE-VZ-NEXT: retq
;
; BDVER2-LABEL: test03:
; BDVER2: # %bb.0: # %entry
@@ -279,15 +280,15 @@ define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-NEXT: vzeroupper
; VZ-NEXT: retq
;
-; FAST-ymm-zmm-LABEL: test04:
-; FAST-ymm-zmm: # %bb.0:
-; FAST-ymm-zmm-NEXT: pushq %rax
-; FAST-ymm-zmm-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; FAST-ymm-zmm-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; FAST-ymm-zmm-NEXT: callq do_avx
-; FAST-ymm-zmm-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; FAST-ymm-zmm-NEXT: popq %rax
-; FAST-ymm-zmm-NEXT: retq
+; DISABLE-VZ-LABEL: test04:
+; DISABLE-VZ: # %bb.0:
+; DISABLE-VZ-NEXT: pushq %rax
+; DISABLE-VZ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; DISABLE-VZ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; DISABLE-VZ-NEXT: callq do_avx
+; DISABLE-VZ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; DISABLE-VZ-NEXT: popq %rax
+; DISABLE-VZ-NEXT: retq
;
; BDVER2-LABEL: test04:
; BDVER2: # %bb.0:
More information about the cfe-commits
mailing list