[llvm] 61fadd0 - [X86] Fast AVX-512-VNNI vpdpwssd tuning (#85375)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 15 04:15:44 PDT 2024
Author: Ganesh
Date: 2024-03-15T16:45:41+05:30
New Revision: 61fadd0b09fb012b628b050725d348ad2164f328
URL: https://github.com/llvm/llvm-project/commit/61fadd0b09fb012b628b050725d348ad2164f328
DIFF: https://github.com/llvm/llvm-project/commit/61fadd0b09fb012b628b050725d348ad2164f328.diff
LOG: [X86] Fast AVX-512-VNNI vpdpwssd tuning (#85375)
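Add a tuning feature (fast-dpwssd) to fix
https://github.com/llvm/llvm-project/issues/84182
When the feature is set (it is enabled for znver4), the machine combiner no longer splits vpdpwssd, so vpdpwssd is generated instead of the vpmaddwd + vpaddd sequence.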
Added:
llvm/test/CodeGen/X86/vpdpwssd.ll
Modified:
llvm/lib/Target/X86/X86.td
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/lib/Target/X86/X86InstrPredicates.td
llvm/lib/Target/X86/X86TargetTransformInfo.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 8367f938c0ddfa..78bc043911f2fc 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -683,6 +683,12 @@ def TuningFastGather
: SubtargetFeature<"fast-gather", "HasFastGather", "true",
"Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">;
+// Generate vpdpwssd instead of vpmaddwd+vpaddd sequence.
+def TuningFastDPWSSD
+ : SubtargetFeature<
+ "fast-dpwssd", "HasFastDPWSSD", "true",
+ "Prefer vpdpwssd instruction over vpmaddwd+vpaddd instruction sequence">;
+
def TuningPreferNoGather
: SubtargetFeature<"prefer-no-gather", "PreferGather", "false",
"Prefer no gather instructions">;
@@ -1502,7 +1508,11 @@ def ProcessorFeatures {
!listconcat(ZN2Tuning, ZN3AdditionalTuning);
list<SubtargetFeature> ZN3Features =
!listconcat(ZN2Features, ZN3AdditionalFeatures);
- list<SubtargetFeature> ZN4Tuning = ZN3Tuning;
+
+
+ list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastDPWSSD];
+ list<SubtargetFeature> ZN4Tuning =
+ !listconcat(ZN3Tuning, ZN4AdditionalTuning);
list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512,
FeatureEVEX512,
FeatureCDI,
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index b65f49527ae5dd..43ae6fd590745c 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10565,15 +10565,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
bool DoRegPressureReduce) const {
unsigned Opc = Root.getOpcode();
switch (Opc) {
- default:
- return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
- DoRegPressureReduce);
case X86::VPDPWSSDrr:
case X86::VPDPWSSDrm:
case X86::VPDPWSSDYrr:
case X86::VPDPWSSDYrm: {
- Patterns.push_back(MachineCombinerPattern::DPWSSD);
- return true;
+ if (!Subtarget.hasFastDPWSSD()) {
+ Patterns.push_back(MachineCombinerPattern::DPWSSD);
+ return true;
+ }
+ break;
}
case X86::VPDPWSSDZ128r:
case X86::VPDPWSSDZ128m:
@@ -10581,11 +10581,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
case X86::VPDPWSSDZ256m:
case X86::VPDPWSSDZr:
case X86::VPDPWSSDZm: {
- if (Subtarget.hasBWI())
+ if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
Patterns.push_back(MachineCombinerPattern::DPWSSD);
- return true;
+ return true;
+ }
+ break;
}
}
+ return TargetInstrInfo::getMachineCombinerPatterns(Root,
+ Patterns, DoRegPressureReduce);
}
static void
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 7dd51ba6c027ae..b8e7768bdaf3c4 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -238,5 +238,6 @@ def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
def HasFSRM : Predicate<"Subtarget->hasFSRM()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
+def HasFastDPWSSD: Predicate<"Subtarget->hasFastDPWSSD()">;
def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 1a5e6bc886aa67..23035f655098a7 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -94,6 +94,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::TuningNoDomainDelayBlend,
X86::TuningPreferShiftShuffle,
X86::TuningFastImmVectorShift,
+ X86::TuningFastDPWSSD,
// Perf-tuning flags.
X86::TuningFastGather,
diff --git a/llvm/test/CodeGen/X86/vpdpwssd.ll b/llvm/test/CodeGen/X86/vpdpwssd.ll
new file mode 100644
index 00000000000000..e6a07b4aeb2719
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vpdpwssd.ll
@@ -0,0 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s
+
+define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
+; CHECK-LABEL: vpdpwssd_test:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %4 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
+ ret <16 x i32> %4
+}
More information about the llvm-commits
mailing list