[llvm] cecaf29 - Adding tuning flags for int <-> fp domain switching penalties; NFC
Noah Goldstein via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 27 16:53:52 PST 2023
Author: Noah Goldstein
Date: 2023-02-27T18:53:25-06:00
New Revision: cecaf295898f6bb23b052892c1d06c27f2715b0d
URL: https://github.com/llvm/llvm-project/commit/cecaf295898f6bb23b052892c1d06c27f2715b0d
DIFF: https://github.com/llvm/llvm-project/commit/cecaf295898f6bb23b052892c1d06c27f2715b0d.diff
LOG: Adding tuning flags for int <-> fp domain switching penalties; NFC
Atom
- No domain switching penalties
Nehalem+
- No penalty on moves
Haswell+
- No penalty on moves / shuffles
Skylake+
- No penalty on moves / shuffles / blends
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D143859
Added:
Modified:
llvm/lib/Target/X86/X86.td
llvm/lib/Target/X86/X86Subtarget.h
llvm/lib/Target/X86/X86TargetTransformInfo.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 5da38d8a5dcc..26934620814a 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -527,6 +527,29 @@ def TuningFastVariablePerLaneShuffle
"HasFastVariablePerLaneShuffle",
"true", "Per-lane shuffles with variable masks are fast">;
+// Goldmont / Tremont (atom in general) has no bypass delay
+def TuningNoDomainDelay : SubtargetFeature<"no-bypass-delay",
+ "NoDomainDelay","true",
+ "Has no bypass delay when using the 'wrong' domain">;
+
+// Many processors (Nehalem+ on Intel) have no bypass delay when
+// using the wrong mov type.
+def TuningNoDomainDelayMov : SubtargetFeature<"no-bypass-delay-mov",
+ "NoDomainDelayMov","true",
+ "Has no bypass delay when using the 'wrong' mov type">;
+
+// Newer processors (Skylake+ on Intel) have no bypass delay when
+// using the wrong blend type.
+def TuningNoDomainDelayBlend : SubtargetFeature<"no-bypass-delay-blend",
+ "NoDomainDelayBlend","true",
+ "Has no bypass delay when using the 'wrong' blend type">;
+
+// Newer processors (Haswell+ on Intel) have no bypass delay when
+// using the wrong shuffle type.
+def TuningNoDomainDelayShuffle : SubtargetFeature<"no-bypass-delay-shuffle",
+ "NoDomainDelayShuffle","true",
+ "Has no bypass delay when using the 'wrong' shuffle type">;
+
// On some X86 processors, a vzeroupper instruction should be inserted after
// using ymm/zmm registers before executing code that may use SSE instructions.
def TuningInsertVZEROUPPER
@@ -781,7 +804,8 @@ def ProcessorFeatures {
// Nehalem
list<SubtargetFeature> NHMFeatures = X86_64V2Features;
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningNoDomainDelayMov];
// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -801,7 +825,8 @@ def ProcessorFeatures {
TuningFastSHLDRotate,
TuningFast15ByteNOP,
TuningPOPCNTFalseDeps,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningNoDomainDelayMov];
list<SubtargetFeature> SNBFeatures =
!listconcat(WSMFeatures, SNBAdditionalFeatures);
@@ -833,7 +858,9 @@ def ProcessorFeatures {
TuningPOPCNTFalseDeps,
TuningLZCNTFalseDeps,
TuningInsertVZEROUPPER,
- TuningAllowLight256Bit];
+ TuningAllowLight256Bit,
+ TuningNoDomainDelayMov,
+ TuningNoDomainDelayShuffle];
list<SubtargetFeature> HSWFeatures =
!listconcat(IVBFeatures, HSWAdditionalFeatures);
@@ -862,7 +889,10 @@ def ProcessorFeatures {
TuningFastVariablePerLaneShuffle,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
- TuningAllowLight256Bit];
+ TuningAllowLight256Bit,
+ TuningNoDomainDelayMov,
+ TuningNoDomainDelayShuffle,
+ TuningNoDomainDelayBlend];
list<SubtargetFeature> SKLFeatures =
!listconcat(BDWFeatures, SKLAdditionalFeatures);
@@ -891,7 +921,10 @@ def ProcessorFeatures {
TuningPrefer256Bit,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
- TuningAllowLight256Bit];
+ TuningAllowLight256Bit,
+ TuningNoDomainDelayMov,
+ TuningNoDomainDelayShuffle,
+ TuningNoDomainDelayBlend];
list<SubtargetFeature> SKXFeatures =
!listconcat(BDWFeatures, SKXAdditionalFeatures);
@@ -929,7 +962,10 @@ def ProcessorFeatures {
TuningFastVariablePerLaneShuffle,
TuningPrefer256Bit,
TuningInsertVZEROUPPER,
- TuningAllowLight256Bit];
+ TuningAllowLight256Bit,
+ TuningNoDomainDelayMov,
+ TuningNoDomainDelayShuffle,
+ TuningNoDomainDelayBlend];
list<SubtargetFeature> CNLFeatures =
!listconcat(SKLFeatures, CNLAdditionalFeatures);
@@ -954,7 +990,10 @@ def ProcessorFeatures {
TuningFastVariablePerLaneShuffle,
TuningPrefer256Bit,
TuningInsertVZEROUPPER,
- TuningAllowLight256Bit];
+ TuningAllowLight256Bit,
+ TuningNoDomainDelayMov,
+ TuningNoDomainDelayShuffle,
+ TuningNoDomainDelayBlend];
list<SubtargetFeature> ICLFeatures =
!listconcat(CNLFeatures, ICLAdditionalFeatures);
@@ -1028,7 +1067,8 @@ def ProcessorFeatures {
TuningSlowTwoMemOps,
TuningLEAUsesAG,
TuningPadShortFunctions,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningNoDomainDelay];
// Silvermont
list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42,
@@ -1046,7 +1086,8 @@ def ProcessorFeatures {
TuningFast7ByteNOP,
TuningFastMOVBE,
TuningPOPCNTFalseDeps,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningNoDomainDelay];
list<SubtargetFeature> SLMFeatures =
!listconcat(AtomFeatures, SLMAdditionalFeatures);
@@ -1066,7 +1107,8 @@ def ProcessorFeatures {
TuningSlowIncDec,
TuningFastMOVBE,
TuningPOPCNTFalseDeps,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningNoDomainDelay];
list<SubtargetFeature> GLMFeatures =
!listconcat(SLMFeatures, GLMAdditionalFeatures);
@@ -1078,7 +1120,8 @@ def ProcessorFeatures {
TuningSlowLEA,
TuningSlowIncDec,
TuningFastMOVBE,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningNoDomainDelay];
list<SubtargetFeature> GLPFeatures =
!listconcat(GLMFeatures, GLPAdditionalFeatures);
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index f75323b733e0..4c11a4212c31 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -249,6 +249,17 @@ class X86Subtarget final : public X86GenSubtargetInfo {
return hasBWI() && canExtendTo512DQ();
}
+ bool hasNoDomainDelay() const { return NoDomainDelay; }
+ bool hasNoDomainDelayMov() const {
+ return hasNoDomainDelay() || NoDomainDelayMov;
+ }
+ bool hasNoDomainDelayBlend() const {
+ return hasNoDomainDelay() || NoDomainDelayBlend;
+ }
+ bool hasNoDomainDelayShuffle() const {
+ return hasNoDomainDelay() || NoDomainDelayShuffle;
+ }
+
// If there are no 512-bit vectors and we prefer not to use 512-bit registers,
// disable them in the legalizer.
bool useAVX512Regs() const {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 2034cf8e577c..ee6159de44b9 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -88,6 +88,10 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::TuningInsertVZEROUPPER,
X86::TuningUseSLMArithCosts,
X86::TuningUseGLMDivSqrtCosts,
+ X86::TuningNoDomainDelay,
+ X86::TuningNoDomainDelayMov,
+ X86::TuningNoDomainDelayShuffle,
+ X86::TuningNoDomainDelayBlend,
// Perf-tuning flags.
X86::TuningFastGather,
More information about the llvm-commits
mailing list