[llvm] [X86] Fast AVX-512-VNNI vpdpwssd tuning (PR #85033)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 13 00:19:39 PDT 2024
https://github.com/ganeshgit created https://github.com/llvm/llvm-project/pull/85033
Adding a tuning feature to resolve 84182. Generation of vpdpwssd (instead of vpmaddwd + vpaddd sequence) will be retained when the tuning is enabled.
>From 7fdcfd61e3106e8f98f38699f1355d1bc2e69c6f Mon Sep 17 00:00:00 2001
From: Ganesh Gopalasubramanian <Ganesh.Gopalasubramanian at amd.com>
Date: Wed, 13 Mar 2024 12:08:46 +0530
Subject: [PATCH] [X86] Fast AVX-512-VNNI vpdpwssd tuning
---
llvm/lib/Target/X86/X86.td | 12 +++++++++++-
llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +-
llvm/lib/Target/X86/X86InstrInfo.cpp | 13 +++++++++----
llvm/lib/Target/X86/X86InstrPredicates.td | 1 +
llvm/lib/Target/X86/X86TargetTransformInfo.h | 1 +
llvm/test/CodeGen/X86/vpdpwssd.ll | 15 +++++++++++++++
6 files changed, 38 insertions(+), 6 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/vpdpwssd.ll
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 8367f938c0ddfa..3d426ec612bb2a 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -683,6 +683,12 @@ def TuningFastGather
: SubtargetFeature<"fast-gather", "HasFastGather", "true",
"Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">;
+// Generate vpdpwssd instead of vpmaddwd+vpaddd sequence.
+def TuningFastPWSSD
+ : SubtargetFeature<
+ "fast-pwssd", "HasFastPWSSD", "true",
+ "Prefer vpdpwssd instruction over vpmaddwd+vpaddd instruction sequence">;
+
def TuningPreferNoGather
: SubtargetFeature<"prefer-no-gather", "PreferGather", "false",
"Prefer no gather instructions">;
@@ -1502,7 +1508,11 @@ def ProcessorFeatures {
!listconcat(ZN2Tuning, ZN3AdditionalTuning);
list<SubtargetFeature> ZN3Features =
!listconcat(ZN2Features, ZN3AdditionalFeatures);
- list<SubtargetFeature> ZN4Tuning = ZN3Tuning;
+
+
+ list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastPWSSD];
+ list<SubtargetFeature> ZN4Tuning =
+ !listconcat(ZN3Tuning, ZN4AdditionalTuning);
list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512,
FeatureEVEX512,
FeatureCDI,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2b5e3c0379a138..2367591152f7e1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48215,7 +48215,7 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
// We do not split for SSE at all, but we need to split vectors for AVX1 and
// AVX2.
- if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
+ if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
SDValue LoX, HiX;
std::tie(LoX, HiX) = splitVector(X, DAG, DL);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index b65f49527ae5dd..57010f1b02e1f1 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10572,8 +10572,11 @@ bool X86InstrInfo::getMachineCombinerPatterns(
case X86::VPDPWSSDrm:
case X86::VPDPWSSDYrr:
case X86::VPDPWSSDYrm: {
- Patterns.push_back(MachineCombinerPattern::DPWSSD);
- return true;
+ if (!Subtarget.hasFastPWSSD()) {
+ Patterns.push_back(MachineCombinerPattern::DPWSSD);
+ return true;
+ }
+ return false;
}
case X86::VPDPWSSDZ128r:
case X86::VPDPWSSDZ128m:
@@ -10581,9 +10584,11 @@ bool X86InstrInfo::getMachineCombinerPatterns(
case X86::VPDPWSSDZ256m:
case X86::VPDPWSSDZr:
case X86::VPDPWSSDZm: {
- if (Subtarget.hasBWI())
+ if (Subtarget.hasBWI() && !Subtarget.hasFastPWSSD()) {
Patterns.push_back(MachineCombinerPattern::DPWSSD);
- return true;
+ return true;
+ }
+ return false;
}
}
}
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 7dd51ba6c027ae..9fd846dd05c5d0 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -238,5 +238,6 @@ def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
def HasFSRM : Predicate<"Subtarget->hasFSRM()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
+def HasFastPWSSD: Predicate<"Subtarget->hasFastPWSSD()">;
def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 1a5e6bc886aa67..691e318cda3d33 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -94,6 +94,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::TuningNoDomainDelayBlend,
X86::TuningPreferShiftShuffle,
X86::TuningFastImmVectorShift,
+ X86::TuningFastPWSSD,
// Perf-tuning flags.
X86::TuningFastGather,
diff --git a/llvm/test/CodeGen/X86/vpdpwssd.ll b/llvm/test/CodeGen/X86/vpdpwssd.ll
new file mode 100644
index 00000000000000..a044378fae255c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vpdpwssd.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
+; CHECK-LABEL: vpdpwssd_test:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %4 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
+ ret <16 x i32> %4
+}
+
+declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>) #1
+
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
More information about the llvm-commits
mailing list