[llvm] 8f7f9d8 - [X86] Machine combine vnni instruction.
Author: Luo, Yuanke
Date: 2023-04-27T16:42:04+08:00
New Revision: 8f7f9d86a7555263ef08fded15a6b778d796ec3f
URL: https://github.com/llvm/llvm-project/commit/8f7f9d86a7555263ef08fded15a6b778d796ec3f
DIFF: https://github.com/llvm/llvm-project/commit/8f7f9d86a7555263ef08fded15a6b778d796ec3f.diff
LOG: [X86] Machine combine vnni instruction.
"vpmaddwd + vpaddd" can be combined to vpdpwssd and the latency is
reduced after the combination. However, when vpdpwssd is on the critical
path, the combination loses ILP. This happens when vpdpwssd is in a loop:
the vpmaddwd of different iterations can execute in parallel, while each
vpdpwssd carries a data dependency into the next iteration. If vpaddd is
on the critical path while vpmaddwd is not, it is profitable to split
vpdpwssd back into "vpmaddwd + vpaddd".
This patch uses the machine combiner framework to decide whether the
"vpmaddwd + vpaddd" combination is worthwhile. A typical example is shown
below.
```
__m256i foo(int cnt, __m256i c, __m256i b, __m256i *p) {
  for (int i = 0; i < cnt; ++i) {
    __m256i a = p[i];
    __m256i m = _mm256_madd_epi16(b, a);
    c = _mm256_add_epi32(m, c);
  }
  return c;
}
```
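To make the dependency argument concrete, here is a hand-unrolled sketch of
the same loop (illustrative only, not part of the patch; foo_unrolled and the
unroll factor of 4 are hypothetical). The vpmaddwd results m0..m3 do not
depend on c or on each other, so they can be in flight simultaneously; only
the short vpaddd chain through c is serial.
```
#include <immintrin.h>

// Assumes cnt is a multiple of 4; for illustration only.
__m256i foo_unrolled(int cnt, __m256i c, __m256i b, __m256i *p) {
  for (int i = 0; i < cnt; i += 4) {
    __m256i m0 = _mm256_madd_epi16(b, p[i + 0]); // independent multiplies:
    __m256i m1 = _mm256_madd_epi16(b, p[i + 1]); // these can overlap across
    __m256i m2 = _mm256_madd_epi16(b, p[i + 2]); // iterations
    __m256i m3 = _mm256_madd_epi16(b, p[i + 3]);
    c = _mm256_add_epi32(c, m0); // each vpaddd adds only ~1 cycle to the
    c = _mm256_add_epi32(c, m1); // loop-carried chain through c
    c = _mm256_add_epi32(c, m2);
    c = _mm256_add_epi32(c, m3);
  }
  // Had each multiply/add pair stayed fused as vpdpwssd, every step of the
  // chain through c would cost the full dot-product latency instead.
  return c;
}
```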
Differential Revision: https://reviews.llvm.org/D148980
Added:
Modified:
llvm/include/llvm/CodeGen/MachineCombinerPattern.h
llvm/include/llvm/CodeGen/TargetInstrInfo.h
llvm/lib/CodeGen/MachineCombiner.cpp
llvm/lib/Target/X86/X86InstrInfo.cpp
llvm/lib/Target/X86/X86InstrInfo.h
llvm/test/CodeGen/X86/avx512vnni-combine.ll
llvm/test/CodeGen/X86/avxvnni-combine.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
index 39e70d583710e..5be436b69a5b9 100644
--- a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
+++ b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
@@ -175,6 +175,9 @@ enum class MachineCombinerPattern {
FMADD_XA,
FMSUB,
FNMSUB,
+
+ // X86 VNNI
+ DPWSSD,
};
} // end namespace llvm
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 5aa3e65445894..7d8265ad413c2 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1223,6 +1223,13 @@ class TargetInstrInfo : public MCInstrInfo {
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const;
+ /// When calculating the latency of the root instruction, accumulate the
+ /// latency of the inserted sequence into the root latency.
+ /// \param Root - Instruction that could be combined with one of its operands
+ virtual bool accumulateInstrSeqToRootLatency(MachineInstr &Root) const {
+ return true;
+ }
+
/// Attempt to reassociate \P Root and \P Prev according to \P Pattern to
/// reduce critical path length.
void reassociateOps(MachineInstr &Root, MachineInstr &Prev,
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index bb4d1316dd252..5c58d3b446925 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -91,7 +91,8 @@ class MachineCombiner : public MachineFunctionPass {
private:
bool combineInstructions(MachineBasicBlock *);
- MachineInstr *getOperandDef(const MachineOperand &MO);
+ MachineInstr *getOperandDef(const MachineOperand &MO,
+ SmallVectorImpl<MachineInstr *> &InsInstrs);
bool isTransientMI(const MachineInstr *MI);
unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
@@ -149,11 +150,29 @@ void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
+MachineInstr *
+MachineCombiner::getOperandDef(const MachineOperand &MO,
+ SmallVectorImpl<MachineInstr *> &InsInstrs) {
MachineInstr *DefInstr = nullptr;
// We need a virtual register definition.
if (MO.isReg() && MO.getReg().isVirtual())
DefInstr = MRI->getUniqueVRegDef(MO.getReg());
+ // Since the new instructions are not inserted into the machine function,
+ // the def-use information is not added in MRI. So it is possible that
+ // the register is defined in new instructions.
+ if (!DefInstr) {
+ for (auto *MI : InsInstrs) {
+ for (const MachineOperand &DefMO : MI->operands()) {
+ if (!(DefMO.isReg() && DefMO.getReg().isVirtual()))
+ continue;
+ if (!DefMO.isDef())
+ continue;
+ if (DefMO.getReg() != MO.getReg())
+ continue;
+ DefInstr = MI;
+ }
+ }
+ }
// PHI's have no depth etc.
if (DefInstr && DefInstr->isPHI())
DefInstr = nullptr;
@@ -238,7 +257,7 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
InstrPtr, UseIdx);
} else {
- MachineInstr *DefInstr = getOperandDef(MO);
+ MachineInstr *DefInstr = getOperandDef(MO, InsInstrs);
if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
MachineTraceStrategy::TS_Local ||
DefInstr->getParent() == &MBB)) {
@@ -404,8 +423,13 @@ bool MachineCombiner::improvesCriticalPathLen(
// Account for the latency of the inserted and deleted instructions by
unsigned NewRootLatency, RootLatency;
- std::tie(NewRootLatency, RootLatency) =
- getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
+ if (TII->accumulateInstrSeqToRootLatency(*Root)) {
+ std::tie(NewRootLatency, RootLatency) =
+ getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
+ } else {
+ NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
+ RootLatency = TSchedModel.computeInstrLatency(Root);
+ }
unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
unsigned NewCycleCount = NewRootDepth + NewRootLatency;
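As a rough numeric illustration of what the accumulateInstrSeqToRootLatency()
hook above changes (the latencies below are assumptions for illustration, not
values from any real scheduling model): with the default accumulation the
split sequence is charged the sum of the vpmaddwd and vpaddd latencies and can
never beat the fused vpdpwssd, while charging only the last inserted
instruction models the case where vpmaddwd runs off the critical path.
```
// Sketch of the two root-latency models compared above (assumed latencies).
#include <cassert>

int main() {
  const unsigned VpdpwssdLat = 5; // old root instruction
  const unsigned VpmaddwdLat = 5; // first inserted instruction
  const unsigned VpadddLat = 1;   // last inserted instruction

  // Default model: the whole inserted sequence is charged to the new root.
  unsigned NewRootLatencyAccumulated = VpmaddwdLat + VpadddLat; // 6 > 5

  // X86 VNNI model (hook returns false): only InsInstrs.back(), the vpaddd,
  // stays on the accumulator's critical path.
  unsigned NewRootLatencyLastOnly = VpadddLat; // 1 < 5

  assert(NewRootLatencyAccumulated > VpdpwssdLat); // split would be rejected
  assert(NewRootLatencyLastOnly < VpdpwssdLat);    // split can be accepted
  return 0;
}
```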
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 004104c860fd7..2ee9ab6f95f34 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -9749,5 +9750,141 @@ X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
return It;
}
+bool X86InstrInfo::getMachineCombinerPatterns(
+ MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
+ bool DoRegPressureReduce) const {
+ unsigned Opc = Root.getOpcode();
+ switch (Opc) {
+ default:
+ return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+ DoRegPressureReduce);
+ case X86::VPDPWSSDrr:
+ case X86::VPDPWSSDrm:
+ case X86::VPDPWSSDYrr:
+ case X86::VPDPWSSDYrm: {
+ Patterns.push_back(MachineCombinerPattern::DPWSSD);
+ return true;
+ }
+ case X86::VPDPWSSDZ128r:
+ case X86::VPDPWSSDZ128m:
+ case X86::VPDPWSSDZ256r:
+ case X86::VPDPWSSDZ256m:
+ case X86::VPDPWSSDZr:
+ case X86::VPDPWSSDZm: {
+ if (Subtarget.hasBWI())
+ Patterns.push_back(MachineCombinerPattern::DPWSSD);
+ return true;
+ }
+ }
+}
+
+static void
+genAlternativeDpCodeSequence(MachineInstr &Root, const TargetInstrInfo &TII,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
+ MachineFunction *MF = Root.getMF();
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+
+ unsigned Opc = Root.getOpcode();
+ unsigned AddOpc;
+ unsigned MaddOpc;
+ switch (Opc) {
+ default:
+ assert(false && "It should not reach here");
+ break;
+ // vpdpwssd xmm2,xmm3,xmm1
+ // -->
+ // vpmaddwd xmm3,xmm3,xmm1
+ // vpaddd xmm2,xmm2,xmm3
+ case X86::VPDPWSSDrr:
+ MaddOpc = X86::VPMADDWDrr;
+ AddOpc = X86::VPADDDrr;
+ break;
+ case X86::VPDPWSSDrm:
+ MaddOpc = X86::VPMADDWDrm;
+ AddOpc = X86::VPADDDrr;
+ break;
+ case X86::VPDPWSSDZ128r:
+ MaddOpc = X86::VPMADDWDZ128rr;
+ AddOpc = X86::VPADDDZ128rr;
+ break;
+ case X86::VPDPWSSDZ128m:
+ MaddOpc = X86::VPMADDWDZ128rm;
+ AddOpc = X86::VPADDDZ128rr;
+ break;
+ // vpdpwssd ymm2,ymm3,ymm1
+ // -->
+ // vpmaddwd ymm3,ymm3,ymm1
+ // vpaddd ymm2,ymm2,ymm3
+ case X86::VPDPWSSDYrr:
+ MaddOpc = X86::VPMADDWDYrr;
+ AddOpc = X86::VPADDDYrr;
+ break;
+ case X86::VPDPWSSDYrm:
+ MaddOpc = X86::VPMADDWDYrm;
+ AddOpc = X86::VPADDDYrr;
+ break;
+ case X86::VPDPWSSDZ256r:
+ MaddOpc = X86::VPMADDWDZ256rr;
+ AddOpc = X86::VPADDDZ256rr;
+ break;
+ case X86::VPDPWSSDZ256m:
+ MaddOpc = X86::VPMADDWDZ256rm;
+ AddOpc = X86::VPADDDZ256rr;
+ break;
+ // vpdpwssd zmm2,zmm3,zmm1
+ // -->
+ // vpmaddwd zmm3,zmm3,zmm1
+ // vpaddd zmm2,zmm2,zmm3
+ case X86::VPDPWSSDZr:
+ MaddOpc = X86::VPMADDWDZrr;
+ AddOpc = X86::VPADDDZrr;
+ break;
+ case X86::VPDPWSSDZm:
+ MaddOpc = X86::VPMADDWDZrm;
+ AddOpc = X86::VPADDDZrr;
+ break;
+ }
+ // Create vpmaddwd.
+ const TargetRegisterClass *RC =
+ RegInfo.getRegClass(Root.getOperand(0).getReg());
+ Register NewReg = RegInfo.createVirtualRegister(RC);
+ MachineInstr *Madd = Root.getMF()->CloneMachineInstr(&Root);
+ Madd->setDesc(TII.get(MaddOpc));
+ Madd->untieRegOperand(1);
+ Madd->removeOperand(1);
+ Madd->getOperand(0).setReg(NewReg);
+ // Create vpaddd.
+ Register DstReg = Root.getOperand(0).getReg();
+ bool IsKill = Root.getOperand(1).isKill();
+ MachineInstr *Add =
+ BuildMI(*MF, MIMetadata(Root), TII.get(AddOpc), DstReg)
+ .addReg(Root.getOperand(1).getReg(), getKillRegState(IsKill))
+ .addReg(Madd->getOperand(0).getReg(), getKillRegState(true));
+ InstrIdxForVirtReg.insert(std::make_pair(DstReg, 0));
+ InsInstrs.push_back(Madd);
+ InsInstrs.push_back(Add);
+ DelInstrs.push_back(&Root);
+}
+
+void X86InstrInfo::genAlternativeCodeSequence(
+ MachineInstr &Root, MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+ switch (Pattern) {
+ default:
+ // Reassociate instructions.
+ TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
+ DelInstrs, InstrIdxForVirtReg);
+ return;
+ case MachineCombinerPattern::DPWSSD:
+ genAlternativeDpCodeSequence(Root, *this, InsInstrs, DelInstrs,
+ InstrIdxForVirtReg);
+ return;
+ }
+}
+
#define GET_INSTRINFO_HELPERS
#include "X86GenInstrInfo.inc"
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index f8016b3a98ef4..4d1950e68f311 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -602,6 +602,34 @@ class X86InstrInfo final : public X86GenInstrInfo {
std::optional<DestSourcePair>
isCopyInstrImpl(const MachineInstr &MI) const override;
+ /// Return true when there is potentially a faster code sequence for an
+ /// instruction chain ending in \p Root. All potential patterns are listed in
+ /// the \p Pattern vector. Pattern should be sorted in priority order since
+ /// the pattern evaluator stops checking as soon as it finds a faster
+ /// sequence.
+ bool
+ getMachineCombinerPatterns(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &Patterns,
+ bool DoRegPressureReduce) const override;
+
+ /// When getMachineCombinerPatterns() finds potential patterns,
+ /// this function generates the instructions that could replace the
+ /// original code sequence.
+ void genAlternativeCodeSequence(
+ MachineInstr &Root, MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
+ /// When calculating the latency of the root instruction, accumulate the
+ /// latency of the inserted sequence into the root latency.
+ /// \param Root - Instruction that could be combined with one of its operands
+ /// For the X86 split vpdpwssd -> (vpmaddwd + vpaddd), vpmaddwd is not on
+ /// the critical path, so the new root latency only includes vpaddd.
+ bool accumulateInstrSeqToRootLatency(MachineInstr &Root) const override {
+ return false;
+ }
+
private:
/// This is a helper for convertToThreeAddress for 8 and 16-bit instructions.
/// We use 32-bit LEA to form 3-address code by promoting to a 32-bit
diff --git a/llvm/test/CodeGen/X86/avx512vnni-combine.ll b/llvm/test/CodeGen/X86/avx512vnni-combine.ll
index 7a8d3af368f01..6a1b25d7f8725 100644
--- a/llvm/test/CodeGen/X86/avx512vnni-combine.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-combine.ll
@@ -1,13 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sapphirerapids | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sapphirerapids -verify-machineinstrs | FileCheck %s
define <8 x i64> @foo_reg_512(<8 x i64> %0, <8 x i64> %1, <8 x i64> %2, <8 x i64> %3, <8 x i64> %4, <8 x i64> %5) {
; CHECK-LABEL: foo_reg_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vpdpwssd %zmm3, %zmm1, %zmm0
-; CHECK-NEXT: vpdpwssd %zmm4, %zmm1, %zmm0
-; CHECK-NEXT: vpdpwssd %zmm5, %zmm1, %zmm0
+; CHECK-NEXT: vpmaddwd %zmm3, %zmm1, %zmm2
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpmaddwd %zmm4, %zmm1, %zmm2
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpmaddwd %zmm5, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%7 = bitcast <8 x i64> %0 to <16 x i32>
%8 = bitcast <8 x i64> %1 to <16 x i32>
@@ -54,9 +57,12 @@ define <8 x i64> @foo_512(i32 %0, <8 x i64> %1, <8 x i64> %2, ptr %3) {
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vpdpwssd -192(%rdi), %zmm1, %zmm0
-; CHECK-NEXT: vpdpwssd -128(%rdi), %zmm1, %zmm0
-; CHECK-NEXT: vpdpwssd -64(%rdi), %zmm1, %zmm0
-; CHECK-NEXT: vpdpwssd (%rdi), %zmm1, %zmm0
+; CHECK-NEXT: vpmaddwd -128(%rdi), %zmm1, %zmm2
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpmaddwd -64(%rdi), %zmm1, %zmm2
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpmaddwd (%rdi), %zmm1, %zmm2
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; CHECK-NEXT: addq $4, %rcx
; CHECK-NEXT: addq $256, %rdi # imm = 0x100
; CHECK-NEXT: cmpq %rcx, %rdx
@@ -179,8 +185,9 @@ define void @bar_512(i32 %0, ptr %1, <8 x i64> %2, ptr %3) {
; CHECK-NEXT: vmovdqa64 (%rsi,%r8), %zmm2
; CHECK-NEXT: vpdpwssd -64(%rdx,%r8), %zmm0, %zmm1
; CHECK-NEXT: vmovdqa64 %zmm1, -64(%rsi,%r8)
-; CHECK-NEXT: vpdpwssd (%rdx,%r8), %zmm0, %zmm2
-; CHECK-NEXT: vmovdqa64 %zmm2, (%rsi,%r8)
+; CHECK-NEXT: vpmaddwd (%rdx,%r8), %zmm0, %zmm1
+; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vmovdqa64 %zmm1, (%rsi,%r8)
; CHECK-NEXT: addq $2, %rcx
; CHECK-NEXT: subq $-128, %r8
; CHECK-NEXT: cmpq %rcx, %rdi
@@ -190,9 +197,9 @@ define void @bar_512(i32 %0, ptr %1, <8 x i64> %2, ptr %3) {
; CHECK-NEXT: je .LBB2_5
; CHECK-NEXT: # %bb.4:
; CHECK-NEXT: shlq $6, %rcx
-; CHECK-NEXT: vmovdqa64 (%rsi,%rcx), %zmm1
-; CHECK-NEXT: vpdpwssd (%rdx,%rcx), %zmm0, %zmm1
-; CHECK-NEXT: vmovdqa64 %zmm1, (%rsi,%rcx)
+; CHECK-NEXT: vpmaddwd (%rdx,%rcx), %zmm0, %zmm0
+; CHECK-NEXT: vpaddd (%rsi,%rcx), %zmm0, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi,%rcx)
; CHECK-NEXT: .LBB2_5:
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/avxvnni-combine.ll b/llvm/test/CodeGen/X86/avxvnni-combine.ll
index fc86c80e2441b..3eabd3ff8b6da 100644
--- a/llvm/test/CodeGen/X86/avxvnni-combine.ll
+++ b/llvm/test/CodeGen/X86/avxvnni-combine.ll
@@ -1,23 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s --check-prefixes=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sapphirerapids | FileCheck %s --check-prefixes=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,ADL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sapphirerapids -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,SPR
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512
define <2 x i64> @foo_reg_128(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, <2 x i64> %4, <2 x i64> %5) {
; AVX-LABEL: foo_reg_128:
; AVX: # %bb.0:
; AVX-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0
-; AVX-NEXT: {vex} vpdpwssd %xmm3, %xmm1, %xmm0
-; AVX-NEXT: {vex} vpdpwssd %xmm4, %xmm1, %xmm0
-; AVX-NEXT: {vex} vpdpwssd %xmm5, %xmm1, %xmm0
+; AVX-NEXT: vpmaddwd %xmm3, %xmm1, %xmm2
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpmaddwd %xmm4, %xmm1, %xmm2
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: foo_reg_128:
; AVX512: # %bb.0:
; AVX512-NEXT: vpdpwssd %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: vpdpwssd %xmm3, %xmm1, %xmm0
-; AVX512-NEXT: vpdpwssd %xmm4, %xmm1, %xmm0
-; AVX512-NEXT: vpdpwssd %xmm5, %xmm1, %xmm0
+; AVX512-NEXT: vpmaddwd %xmm3, %xmm1, %xmm2
+; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmaddwd %xmm4, %xmm1, %xmm2
+; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%7 = bitcast <2 x i64> %0 to <4 x i32>
%8 = bitcast <2 x i64> %1 to <4 x i32>
@@ -36,49 +42,99 @@ define <2 x i64> @foo_reg_128(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2, <2 x i64
declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) #1
define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) {
-; AVX-LABEL: foo_128:
-; AVX: # %bb.0:
-; AVX-NEXT: testl %edi, %edi
-; AVX-NEXT: jle .LBB1_6
-; AVX-NEXT: # %bb.1:
-; AVX-NEXT: movl %edi, %edx
-; AVX-NEXT: movl %edx, %eax
-; AVX-NEXT: andl $3, %eax
-; AVX-NEXT: cmpl $4, %edi
-; AVX-NEXT: jae .LBB1_7
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: jmp .LBB1_3
-; AVX-NEXT: .LBB1_7:
-; AVX-NEXT: andl $-4, %edx
-; AVX-NEXT: leaq 48(%rsi), %rdi
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: .p2align 4, 0x90
-; AVX-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1
-; AVX-NEXT: {vex} vpdpwssd -48(%rdi), %xmm1, %xmm0
-; AVX-NEXT: {vex} vpdpwssd -32(%rdi), %xmm1, %xmm0
-; AVX-NEXT: {vex} vpdpwssd -16(%rdi), %xmm1, %xmm0
-; AVX-NEXT: {vex} vpdpwssd (%rdi), %xmm1, %xmm0
-; AVX-NEXT: addq $4, %rcx
-; AVX-NEXT: addq $64, %rdi
-; AVX-NEXT: cmpq %rcx, %rdx
-; AVX-NEXT: jne .LBB1_8
-; AVX-NEXT: .LBB1_3:
-; AVX-NEXT: testq %rax, %rax
-; AVX-NEXT: je .LBB1_6
-; AVX-NEXT: # %bb.4: # %.preheader
-; AVX-NEXT: shlq $4, %rcx
-; AVX-NEXT: addq %rcx, %rsi
-; AVX-NEXT: shlq $4, %rax
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: .p2align 4, 0x90
-; AVX-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1
-; AVX-NEXT: {vex} vpdpwssd (%rsi,%rcx), %xmm1, %xmm0
-; AVX-NEXT: addq $16, %rcx
-; AVX-NEXT: cmpq %rcx, %rax
-; AVX-NEXT: jne .LBB1_5
-; AVX-NEXT: .LBB1_6:
-; AVX-NEXT: retq
+; ADL-LABEL: foo_128:
+; ADL: # %bb.0:
+; ADL-NEXT: testl %edi, %edi
+; ADL-NEXT: jle .LBB1_6
+; ADL-NEXT: # %bb.1:
+; ADL-NEXT: movl %edi, %edx
+; ADL-NEXT: movl %edx, %eax
+; ADL-NEXT: andl $3, %eax
+; ADL-NEXT: cmpl $4, %edi
+; ADL-NEXT: jae .LBB1_7
+; ADL-NEXT: # %bb.2:
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: jmp .LBB1_3
+; ADL-NEXT: .LBB1_7:
+; ADL-NEXT: andl $-4, %edx
+; ADL-NEXT: leaq 48(%rsi), %rdi
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: .p2align 4, 0x90
+; ADL-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1
+; ADL-NEXT: {vex} vpdpwssd -48(%rdi), %xmm1, %xmm0
+; ADL-NEXT: vpmaddwd -32(%rdi), %xmm1, %xmm2
+; ADL-NEXT: vpmaddwd -16(%rdi), %xmm1, %xmm3
+; ADL-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; ADL-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; ADL-NEXT: vpmaddwd (%rdi), %xmm1, %xmm2
+; ADL-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; ADL-NEXT: addq $4, %rcx
+; ADL-NEXT: addq $64, %rdi
+; ADL-NEXT: cmpq %rcx, %rdx
+; ADL-NEXT: jne .LBB1_8
+; ADL-NEXT: .LBB1_3:
+; ADL-NEXT: testq %rax, %rax
+; ADL-NEXT: je .LBB1_6
+; ADL-NEXT: # %bb.4: # %.preheader
+; ADL-NEXT: shlq $4, %rcx
+; ADL-NEXT: addq %rcx, %rsi
+; ADL-NEXT: shlq $4, %rax
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: .p2align 4, 0x90
+; ADL-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1
+; ADL-NEXT: {vex} vpdpwssd (%rsi,%rcx), %xmm1, %xmm0
+; ADL-NEXT: addq $16, %rcx
+; ADL-NEXT: cmpq %rcx, %rax
+; ADL-NEXT: jne .LBB1_5
+; ADL-NEXT: .LBB1_6:
+; ADL-NEXT: retq
+;
+; SPR-LABEL: foo_128:
+; SPR: # %bb.0:
+; SPR-NEXT: testl %edi, %edi
+; SPR-NEXT: jle .LBB1_6
+; SPR-NEXT: # %bb.1:
+; SPR-NEXT: movl %edi, %edx
+; SPR-NEXT: movl %edx, %eax
+; SPR-NEXT: andl $3, %eax
+; SPR-NEXT: cmpl $4, %edi
+; SPR-NEXT: jae .LBB1_7
+; SPR-NEXT: # %bb.2:
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: jmp .LBB1_3
+; SPR-NEXT: .LBB1_7:
+; SPR-NEXT: andl $-4, %edx
+; SPR-NEXT: leaq 48(%rsi), %rdi
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: .p2align 4, 0x90
+; SPR-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1
+; SPR-NEXT: {vex} vpdpwssd -48(%rdi), %xmm1, %xmm0
+; SPR-NEXT: vpmaddwd -32(%rdi), %xmm1, %xmm2
+; SPR-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; SPR-NEXT: vpmaddwd -16(%rdi), %xmm1, %xmm2
+; SPR-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; SPR-NEXT: vpmaddwd (%rdi), %xmm1, %xmm2
+; SPR-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; SPR-NEXT: addq $4, %rcx
+; SPR-NEXT: addq $64, %rdi
+; SPR-NEXT: cmpq %rcx, %rdx
+; SPR-NEXT: jne .LBB1_8
+; SPR-NEXT: .LBB1_3:
+; SPR-NEXT: testq %rax, %rax
+; SPR-NEXT: je .LBB1_6
+; SPR-NEXT: # %bb.4: # %.preheader
+; SPR-NEXT: shlq $4, %rcx
+; SPR-NEXT: addq %rcx, %rsi
+; SPR-NEXT: shlq $4, %rax
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: .p2align 4, 0x90
+; SPR-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1
+; SPR-NEXT: {vex} vpdpwssd (%rsi,%rcx), %xmm1, %xmm0
+; SPR-NEXT: addq $16, %rcx
+; SPR-NEXT: cmpq %rcx, %rax
+; SPR-NEXT: jne .LBB1_5
+; SPR-NEXT: .LBB1_6:
+; SPR-NEXT: retq
;
; AVX512-LABEL: foo_128:
; AVX512: # %bb.0:
@@ -100,9 +156,12 @@ define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) {
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vpdpwssd -48(%rdi), %xmm1, %xmm0
-; AVX512-NEXT: vpdpwssd -32(%rdi), %xmm1, %xmm0
-; AVX512-NEXT: vpdpwssd -16(%rdi), %xmm1, %xmm0
-; AVX512-NEXT: vpdpwssd (%rdi), %xmm1, %xmm0
+; AVX512-NEXT: vpmaddwd -32(%rdi), %xmm1, %xmm2
+; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmaddwd -16(%rdi), %xmm1, %xmm2
+; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpmaddwd (%rdi), %xmm1, %xmm2
+; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: addq $4, %rcx
; AVX512-NEXT: addq $64, %rdi
; AVX512-NEXT: cmpq %rcx, %rdx
@@ -197,44 +256,84 @@ define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) {
}
define void @bar_128(i32 %0, ptr %1, <2 x i64> %2, ptr %3) {
-; AVX-LABEL: bar_128:
-; AVX: # %bb.0:
-; AVX-NEXT: testl %edi, %edi
-; AVX-NEXT: jle .LBB2_5
-; AVX-NEXT: # %bb.1:
-; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: cmpl $1, %edi
-; AVX-NEXT: jne .LBB2_6
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: jmp .LBB2_3
-; AVX-NEXT: .LBB2_6:
-; AVX-NEXT: movl %eax, %edi
-; AVX-NEXT: andl $-2, %edi
-; AVX-NEXT: movl $16, %r8d
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: .p2align 4, 0x90
-; AVX-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1
-; AVX-NEXT: vmovdqa -16(%rsi,%r8), %xmm1
-; AVX-NEXT: vmovdqa (%rsi,%r8), %xmm2
-; AVX-NEXT: {vex} vpdpwssd -16(%rdx,%r8), %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, -16(%rsi,%r8)
-; AVX-NEXT: {vex} vpdpwssd (%rdx,%r8), %xmm0, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, (%rsi,%r8)
-; AVX-NEXT: addq $2, %rcx
-; AVX-NEXT: addq $32, %r8
-; AVX-NEXT: cmpq %rcx, %rdi
-; AVX-NEXT: jne .LBB2_7
-; AVX-NEXT: .LBB2_3:
-; AVX-NEXT: testb $1, %al
-; AVX-NEXT: je .LBB2_5
-; AVX-NEXT: # %bb.4:
-; AVX-NEXT: shlq $4, %rcx
-; AVX-NEXT: vmovdqa (%rsi,%rcx), %xmm1
-; AVX-NEXT: {vex} vpdpwssd (%rdx,%rcx), %xmm0, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rsi,%rcx)
-; AVX-NEXT: .LBB2_5:
-; AVX-NEXT: retq
+; ADL-LABEL: bar_128:
+; ADL: # %bb.0:
+; ADL-NEXT: testl %edi, %edi
+; ADL-NEXT: jle .LBB2_5
+; ADL-NEXT: # %bb.1:
+; ADL-NEXT: movl %edi, %eax
+; ADL-NEXT: cmpl $1, %edi
+; ADL-NEXT: jne .LBB2_6
+; ADL-NEXT: # %bb.2:
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: jmp .LBB2_3
+; ADL-NEXT: .LBB2_6:
+; ADL-NEXT: movl %eax, %edi
+; ADL-NEXT: andl $-2, %edi
+; ADL-NEXT: movl $16, %r8d
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: .p2align 4, 0x90
+; ADL-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1
+; ADL-NEXT: vmovdqa (%rsi,%r8), %xmm1
+; ADL-NEXT: vpmaddwd -16(%rdx,%r8), %xmm0, %xmm2
+; ADL-NEXT: vpaddd -16(%rsi,%r8), %xmm2, %xmm2
+; ADL-NEXT: vmovdqa %xmm2, -16(%rsi,%r8)
+; ADL-NEXT: {vex} vpdpwssd (%rdx,%r8), %xmm0, %xmm1
+; ADL-NEXT: vmovdqa %xmm1, (%rsi,%r8)
+; ADL-NEXT: addq $2, %rcx
+; ADL-NEXT: addq $32, %r8
+; ADL-NEXT: cmpq %rcx, %rdi
+; ADL-NEXT: jne .LBB2_7
+; ADL-NEXT: .LBB2_3:
+; ADL-NEXT: testb $1, %al
+; ADL-NEXT: je .LBB2_5
+; ADL-NEXT: # %bb.4:
+; ADL-NEXT: shlq $4, %rcx
+; ADL-NEXT: vmovdqa (%rsi,%rcx), %xmm1
+; ADL-NEXT: {vex} vpdpwssd (%rdx,%rcx), %xmm0, %xmm1
+; ADL-NEXT: vmovdqa %xmm1, (%rsi,%rcx)
+; ADL-NEXT: .LBB2_5:
+; ADL-NEXT: retq
+;
+; SPR-LABEL: bar_128:
+; SPR: # %bb.0:
+; SPR-NEXT: testl %edi, %edi
+; SPR-NEXT: jle .LBB2_5
+; SPR-NEXT: # %bb.1:
+; SPR-NEXT: movl %edi, %eax
+; SPR-NEXT: cmpl $1, %edi
+; SPR-NEXT: jne .LBB2_6
+; SPR-NEXT: # %bb.2:
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: jmp .LBB2_3
+; SPR-NEXT: .LBB2_6:
+; SPR-NEXT: movl %eax, %edi
+; SPR-NEXT: andl $-2, %edi
+; SPR-NEXT: movl $16, %r8d
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: .p2align 4, 0x90
+; SPR-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1
+; SPR-NEXT: vmovdqa -16(%rsi,%r8), %xmm1
+; SPR-NEXT: vmovdqa (%rsi,%r8), %xmm2
+; SPR-NEXT: {vex} vpdpwssd -16(%rdx,%r8), %xmm0, %xmm1
+; SPR-NEXT: vmovdqa %xmm1, -16(%rsi,%r8)
+; SPR-NEXT: vpmaddwd (%rdx,%r8), %xmm0, %xmm1
+; SPR-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; SPR-NEXT: vmovdqa %xmm1, (%rsi,%r8)
+; SPR-NEXT: addq $2, %rcx
+; SPR-NEXT: addq $32, %r8
+; SPR-NEXT: cmpq %rcx, %rdi
+; SPR-NEXT: jne .LBB2_7
+; SPR-NEXT: .LBB2_3:
+; SPR-NEXT: testb $1, %al
+; SPR-NEXT: je .LBB2_5
+; SPR-NEXT: # %bb.4:
+; SPR-NEXT: shlq $4, %rcx
+; SPR-NEXT: vpmaddwd (%rdx,%rcx), %xmm0, %xmm0
+; SPR-NEXT: vpaddd (%rsi,%rcx), %xmm0, %xmm0
+; SPR-NEXT: vmovdqa %xmm0, (%rsi,%rcx)
+; SPR-NEXT: .LBB2_5:
+; SPR-NEXT: retq
;
; AVX512-LABEL: bar_128:
; AVX512: # %bb.0:
@@ -258,8 +357,9 @@ define void @bar_128(i32 %0, ptr %1, <2 x i64> %2, ptr %3) {
; AVX512-NEXT: vmovdqa (%rsi,%r8), %xmm2
; AVX512-NEXT: vpdpwssd -16(%rdx,%r8), %xmm0, %xmm1
; AVX512-NEXT: vmovdqa %xmm1, -16(%rsi,%r8)
-; AVX512-NEXT: vpdpwssd (%rdx,%r8), %xmm0, %xmm2
-; AVX512-NEXT: vmovdqa %xmm2, (%rsi,%r8)
+; AVX512-NEXT: vpmaddwd (%rdx,%r8), %xmm0, %xmm1
+; AVX512-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, (%rsi,%r8)
; AVX512-NEXT: addq $2, %rcx
; AVX512-NEXT: addq $32, %r8
; AVX512-NEXT: cmpq %rcx, %rdi
@@ -269,9 +369,9 @@ define void @bar_128(i32 %0, ptr %1, <2 x i64> %2, ptr %3) {
; AVX512-NEXT: je .LBB2_5
; AVX512-NEXT: # %bb.4:
; AVX512-NEXT: shlq $4, %rcx
-; AVX512-NEXT: vmovdqa (%rsi,%rcx), %xmm1
-; AVX512-NEXT: vpdpwssd (%rdx,%rcx), %xmm0, %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, (%rsi,%rcx)
+; AVX512-NEXT: vpmaddwd (%rdx,%rcx), %xmm0, %xmm0
+; AVX512-NEXT: vpaddd (%rsi,%rcx), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi,%rcx)
; AVX512-NEXT: .LBB2_5:
; AVX512-NEXT: retq
%5 = icmp sgt i32 %0, 0
@@ -333,17 +433,23 @@ define <4 x i64> @foo_reg_256(<4 x i64> %0, <4 x i64> %1, <4 x i64> %2, <4 x i64
; AVX-LABEL: foo_reg_256:
; AVX: # %bb.0:
; AVX-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0
-; AVX-NEXT: {vex} vpdpwssd %ymm3, %ymm1, %ymm0
-; AVX-NEXT: {vex} vpdpwssd %ymm4, %ymm1, %ymm0
-; AVX-NEXT: {vex} vpdpwssd %ymm5, %ymm1, %ymm0
+; AVX-NEXT: vpmaddwd %ymm3, %ymm1, %ymm2
+; AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vpmaddwd %ymm4, %ymm1, %ymm2
+; AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vpmaddwd %ymm5, %ymm1, %ymm1
+; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: foo_reg_256:
; AVX512: # %bb.0:
; AVX512-NEXT: vpdpwssd %ymm2, %ymm1, %ymm0
-; AVX512-NEXT: vpdpwssd %ymm3, %ymm1, %ymm0
-; AVX512-NEXT: vpdpwssd %ymm4, %ymm1, %ymm0
-; AVX512-NEXT: vpdpwssd %ymm5, %ymm1, %ymm0
+; AVX512-NEXT: vpmaddwd %ymm3, %ymm1, %ymm2
+; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpmaddwd %ymm4, %ymm1, %ymm2
+; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpmaddwd %ymm5, %ymm1, %ymm1
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%7 = bitcast <4 x i64> %0 to <8 x i32>
%8 = bitcast <4 x i64> %1 to <8 x i32>
@@ -369,49 +475,99 @@ define <4 x i64> @foo_reg_256(<4 x i64> %0, <4 x i64> %1, <4 x i64> %2, <4 x i64
; }
define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) {
-; AVX-LABEL: foo_256:
-; AVX: # %bb.0:
-; AVX-NEXT: testl %edi, %edi
-; AVX-NEXT: jle .LBB4_6
-; AVX-NEXT: # %bb.1:
-; AVX-NEXT: movl %edi, %edx
-; AVX-NEXT: movl %edx, %eax
-; AVX-NEXT: andl $3, %eax
-; AVX-NEXT: cmpl $4, %edi
-; AVX-NEXT: jae .LBB4_7
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: jmp .LBB4_3
-; AVX-NEXT: .LBB4_7:
-; AVX-NEXT: andl $-4, %edx
-; AVX-NEXT: leaq 96(%rsi), %rdi
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: .p2align 4, 0x90
-; AVX-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1
-; AVX-NEXT: {vex} vpdpwssd -96(%rdi), %ymm1, %ymm0
-; AVX-NEXT: {vex} vpdpwssd -64(%rdi), %ymm1, %ymm0
-; AVX-NEXT: {vex} vpdpwssd -32(%rdi), %ymm1, %ymm0
-; AVX-NEXT: {vex} vpdpwssd (%rdi), %ymm1, %ymm0
-; AVX-NEXT: addq $4, %rcx
-; AVX-NEXT: subq $-128, %rdi
-; AVX-NEXT: cmpq %rcx, %rdx
-; AVX-NEXT: jne .LBB4_8
-; AVX-NEXT: .LBB4_3:
-; AVX-NEXT: testq %rax, %rax
-; AVX-NEXT: je .LBB4_6
-; AVX-NEXT: # %bb.4: # %.preheader
-; AVX-NEXT: shlq $5, %rcx
-; AVX-NEXT: addq %rcx, %rsi
-; AVX-NEXT: shlq $5, %rax
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: .p2align 4, 0x90
-; AVX-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1
-; AVX-NEXT: {vex} vpdpwssd (%rsi,%rcx), %ymm1, %ymm0
-; AVX-NEXT: addq $32, %rcx
-; AVX-NEXT: cmpq %rcx, %rax
-; AVX-NEXT: jne .LBB4_5
-; AVX-NEXT: .LBB4_6:
-; AVX-NEXT: retq
+; ADL-LABEL: foo_256:
+; ADL: # %bb.0:
+; ADL-NEXT: testl %edi, %edi
+; ADL-NEXT: jle .LBB4_6
+; ADL-NEXT: # %bb.1:
+; ADL-NEXT: movl %edi, %edx
+; ADL-NEXT: movl %edx, %eax
+; ADL-NEXT: andl $3, %eax
+; ADL-NEXT: cmpl $4, %edi
+; ADL-NEXT: jae .LBB4_7
+; ADL-NEXT: # %bb.2:
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: jmp .LBB4_3
+; ADL-NEXT: .LBB4_7:
+; ADL-NEXT: andl $-4, %edx
+; ADL-NEXT: leaq 96(%rsi), %rdi
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: .p2align 4, 0x90
+; ADL-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1
+; ADL-NEXT: {vex} vpdpwssd -96(%rdi), %ymm1, %ymm0
+; ADL-NEXT: vpmaddwd -64(%rdi), %ymm1, %ymm2
+; ADL-NEXT: vpmaddwd -32(%rdi), %ymm1, %ymm3
+; ADL-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; ADL-NEXT: vpaddd %ymm3, %ymm0, %ymm0
+; ADL-NEXT: vpmaddwd (%rdi), %ymm1, %ymm2
+; ADL-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; ADL-NEXT: addq $4, %rcx
+; ADL-NEXT: subq $-128, %rdi
+; ADL-NEXT: cmpq %rcx, %rdx
+; ADL-NEXT: jne .LBB4_8
+; ADL-NEXT: .LBB4_3:
+; ADL-NEXT: testq %rax, %rax
+; ADL-NEXT: je .LBB4_6
+; ADL-NEXT: # %bb.4: # %.preheader
+; ADL-NEXT: shlq $5, %rcx
+; ADL-NEXT: addq %rcx, %rsi
+; ADL-NEXT: shlq $5, %rax
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: .p2align 4, 0x90
+; ADL-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1
+; ADL-NEXT: {vex} vpdpwssd (%rsi,%rcx), %ymm1, %ymm0
+; ADL-NEXT: addq $32, %rcx
+; ADL-NEXT: cmpq %rcx, %rax
+; ADL-NEXT: jne .LBB4_5
+; ADL-NEXT: .LBB4_6:
+; ADL-NEXT: retq
+;
+; SPR-LABEL: foo_256:
+; SPR: # %bb.0:
+; SPR-NEXT: testl %edi, %edi
+; SPR-NEXT: jle .LBB4_6
+; SPR-NEXT: # %bb.1:
+; SPR-NEXT: movl %edi, %edx
+; SPR-NEXT: movl %edx, %eax
+; SPR-NEXT: andl $3, %eax
+; SPR-NEXT: cmpl $4, %edi
+; SPR-NEXT: jae .LBB4_7
+; SPR-NEXT: # %bb.2:
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: jmp .LBB4_3
+; SPR-NEXT: .LBB4_7:
+; SPR-NEXT: andl $-4, %edx
+; SPR-NEXT: leaq 96(%rsi), %rdi
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: .p2align 4, 0x90
+; SPR-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1
+; SPR-NEXT: {vex} vpdpwssd -96(%rdi), %ymm1, %ymm0
+; SPR-NEXT: vpmaddwd -64(%rdi), %ymm1, %ymm2
+; SPR-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; SPR-NEXT: vpmaddwd -32(%rdi), %ymm1, %ymm2
+; SPR-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; SPR-NEXT: vpmaddwd (%rdi), %ymm1, %ymm2
+; SPR-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; SPR-NEXT: addq $4, %rcx
+; SPR-NEXT: subq $-128, %rdi
+; SPR-NEXT: cmpq %rcx, %rdx
+; SPR-NEXT: jne .LBB4_8
+; SPR-NEXT: .LBB4_3:
+; SPR-NEXT: testq %rax, %rax
+; SPR-NEXT: je .LBB4_6
+; SPR-NEXT: # %bb.4: # %.preheader
+; SPR-NEXT: shlq $5, %rcx
+; SPR-NEXT: addq %rcx, %rsi
+; SPR-NEXT: shlq $5, %rax
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: .p2align 4, 0x90
+; SPR-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1
+; SPR-NEXT: {vex} vpdpwssd (%rsi,%rcx), %ymm1, %ymm0
+; SPR-NEXT: addq $32, %rcx
+; SPR-NEXT: cmpq %rcx, %rax
+; SPR-NEXT: jne .LBB4_5
+; SPR-NEXT: .LBB4_6:
+; SPR-NEXT: retq
;
; AVX512-LABEL: foo_256:
; AVX512: # %bb.0:
@@ -433,9 +589,12 @@ define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) {
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vpdpwssd -96(%rdi), %ymm1, %ymm0
-; AVX512-NEXT: vpdpwssd -64(%rdi), %ymm1, %ymm0
-; AVX512-NEXT: vpdpwssd -32(%rdi), %ymm1, %ymm0
-; AVX512-NEXT: vpdpwssd (%rdi), %ymm1, %ymm0
+; AVX512-NEXT: vpmaddwd -64(%rdi), %ymm1, %ymm2
+; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpmaddwd -32(%rdi), %ymm1, %ymm2
+; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpmaddwd (%rdi), %ymm1, %ymm2
+; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX512-NEXT: addq $4, %rcx
; AVX512-NEXT: subq $-128, %rdi
; AVX512-NEXT: cmpq %rcx, %rdx
@@ -537,45 +696,86 @@ declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
; }
; }
define void @bar_256(i32 %0, ptr %1, <4 x i64> %2, ptr %3) {
-; AVX-LABEL: bar_256:
-; AVX: # %bb.0:
-; AVX-NEXT: testl %edi, %edi
-; AVX-NEXT: jle .LBB5_5
-; AVX-NEXT: # %bb.1:
-; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: cmpl $1, %edi
-; AVX-NEXT: jne .LBB5_6
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: jmp .LBB5_3
-; AVX-NEXT: .LBB5_6:
-; AVX-NEXT: movl %eax, %edi
-; AVX-NEXT: andl $-2, %edi
-; AVX-NEXT: movl $32, %r8d
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: .p2align 4, 0x90
-; AVX-NEXT: .LBB5_7: # =>This Inner Loop Header: Depth=1
-; AVX-NEXT: vmovdqa -32(%rsi,%r8), %ymm1
-; AVX-NEXT: vmovdqa (%rsi,%r8), %ymm2
-; AVX-NEXT: {vex} vpdpwssd -32(%rdx,%r8), %ymm0, %ymm1
-; AVX-NEXT: vmovdqa %ymm1, -32(%rsi,%r8)
-; AVX-NEXT: {vex} vpdpwssd (%rdx,%r8), %ymm0, %ymm2
-; AVX-NEXT: vmovdqa %ymm2, (%rsi,%r8)
-; AVX-NEXT: addq $2, %rcx
-; AVX-NEXT: addq $64, %r8
-; AVX-NEXT: cmpq %rcx, %rdi
-; AVX-NEXT: jne .LBB5_7
-; AVX-NEXT: .LBB5_3:
-; AVX-NEXT: testb $1, %al
-; AVX-NEXT: je .LBB5_5
-; AVX-NEXT: # %bb.4:
-; AVX-NEXT: shlq $5, %rcx
-; AVX-NEXT: vmovdqa (%rsi,%rcx), %ymm1
-; AVX-NEXT: {vex} vpdpwssd (%rdx,%rcx), %ymm0, %ymm1
-; AVX-NEXT: vmovdqa %ymm1, (%rsi,%rcx)
-; AVX-NEXT: .LBB5_5:
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; ADL-LABEL: bar_256:
+; ADL: # %bb.0:
+; ADL-NEXT: testl %edi, %edi
+; ADL-NEXT: jle .LBB5_5
+; ADL-NEXT: # %bb.1:
+; ADL-NEXT: movl %edi, %eax
+; ADL-NEXT: cmpl $1, %edi
+; ADL-NEXT: jne .LBB5_6
+; ADL-NEXT: # %bb.2:
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: jmp .LBB5_3
+; ADL-NEXT: .LBB5_6:
+; ADL-NEXT: movl %eax, %edi
+; ADL-NEXT: andl $-2, %edi
+; ADL-NEXT: movl $32, %r8d
+; ADL-NEXT: xorl %ecx, %ecx
+; ADL-NEXT: .p2align 4, 0x90
+; ADL-NEXT: .LBB5_7: # =>This Inner Loop Header: Depth=1
+; ADL-NEXT: vmovdqa (%rsi,%r8), %ymm1
+; ADL-NEXT: vpmaddwd -32(%rdx,%r8), %ymm0, %ymm2
+; ADL-NEXT: vpaddd -32(%rsi,%r8), %ymm2, %ymm2
+; ADL-NEXT: vmovdqa %ymm2, -32(%rsi,%r8)
+; ADL-NEXT: {vex} vpdpwssd (%rdx,%r8), %ymm0, %ymm1
+; ADL-NEXT: vmovdqa %ymm1, (%rsi,%r8)
+; ADL-NEXT: addq $2, %rcx
+; ADL-NEXT: addq $64, %r8
+; ADL-NEXT: cmpq %rcx, %rdi
+; ADL-NEXT: jne .LBB5_7
+; ADL-NEXT: .LBB5_3:
+; ADL-NEXT: testb $1, %al
+; ADL-NEXT: je .LBB5_5
+; ADL-NEXT: # %bb.4:
+; ADL-NEXT: shlq $5, %rcx
+; ADL-NEXT: vmovdqa (%rsi,%rcx), %ymm1
+; ADL-NEXT: {vex} vpdpwssd (%rdx,%rcx), %ymm0, %ymm1
+; ADL-NEXT: vmovdqa %ymm1, (%rsi,%rcx)
+; ADL-NEXT: .LBB5_5:
+; ADL-NEXT: vzeroupper
+; ADL-NEXT: retq
+;
+; SPR-LABEL: bar_256:
+; SPR: # %bb.0:
+; SPR-NEXT: testl %edi, %edi
+; SPR-NEXT: jle .LBB5_5
+; SPR-NEXT: # %bb.1:
+; SPR-NEXT: movl %edi, %eax
+; SPR-NEXT: cmpl $1, %edi
+; SPR-NEXT: jne .LBB5_6
+; SPR-NEXT: # %bb.2:
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: jmp .LBB5_3
+; SPR-NEXT: .LBB5_6:
+; SPR-NEXT: movl %eax, %edi
+; SPR-NEXT: andl $-2, %edi
+; SPR-NEXT: movl $32, %r8d
+; SPR-NEXT: xorl %ecx, %ecx
+; SPR-NEXT: .p2align 4, 0x90
+; SPR-NEXT: .LBB5_7: # =>This Inner Loop Header: Depth=1
+; SPR-NEXT: vmovdqa -32(%rsi,%r8), %ymm1
+; SPR-NEXT: vmovdqa (%rsi,%r8), %ymm2
+; SPR-NEXT: {vex} vpdpwssd -32(%rdx,%r8), %ymm0, %ymm1
+; SPR-NEXT: vmovdqa %ymm1, -32(%rsi,%r8)
+; SPR-NEXT: vpmaddwd (%rdx,%r8), %ymm0, %ymm1
+; SPR-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; SPR-NEXT: vmovdqa %ymm1, (%rsi,%r8)
+; SPR-NEXT: addq $2, %rcx
+; SPR-NEXT: addq $64, %r8
+; SPR-NEXT: cmpq %rcx, %rdi
+; SPR-NEXT: jne .LBB5_7
+; SPR-NEXT: .LBB5_3:
+; SPR-NEXT: testb $1, %al
+; SPR-NEXT: je .LBB5_5
+; SPR-NEXT: # %bb.4:
+; SPR-NEXT: shlq $5, %rcx
+; SPR-NEXT: vpmaddwd (%rdx,%rcx), %ymm0, %ymm0
+; SPR-NEXT: vpaddd (%rsi,%rcx), %ymm0, %ymm0
+; SPR-NEXT: vmovdqa %ymm0, (%rsi,%rcx)
+; SPR-NEXT: .LBB5_5:
+; SPR-NEXT: vzeroupper
+; SPR-NEXT: retq
;
; AVX512-LABEL: bar_256:
; AVX512: # %bb.0:
@@ -599,8 +799,9 @@ define void @bar_256(i32 %0, ptr %1, <4 x i64> %2, ptr %3) {
; AVX512-NEXT: vmovdqa (%rsi,%r8), %ymm2
; AVX512-NEXT: vpdpwssd -32(%rdx,%r8), %ymm0, %ymm1
; AVX512-NEXT: vmovdqa %ymm1, -32(%rsi,%r8)
-; AVX512-NEXT: vpdpwssd (%rdx,%r8), %ymm0, %ymm2
-; AVX512-NEXT: vmovdqa %ymm2, (%rsi,%r8)
+; AVX512-NEXT: vpmaddwd (%rdx,%r8), %ymm0, %ymm1
+; AVX512-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; AVX512-NEXT: vmovdqa %ymm1, (%rsi,%r8)
; AVX512-NEXT: addq $2, %rcx
; AVX512-NEXT: addq $64, %r8
; AVX512-NEXT: cmpq %rcx, %rdi
@@ -610,9 +811,9 @@ define void @bar_256(i32 %0, ptr %1, <4 x i64> %2, ptr %3) {
; AVX512-NEXT: je .LBB5_5
; AVX512-NEXT: # %bb.4:
; AVX512-NEXT: shlq $5, %rcx
-; AVX512-NEXT: vmovdqa (%rsi,%rcx), %ymm1
-; AVX512-NEXT: vpdpwssd (%rdx,%rcx), %ymm0, %ymm1
-; AVX512-NEXT: vmovdqa %ymm1, (%rsi,%rcx)
+; AVX512-NEXT: vpmaddwd (%rdx,%rcx), %ymm0, %ymm0
+; AVX512-NEXT: vpaddd (%rsi,%rcx), %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa %ymm0, (%rsi,%rcx)
; AVX512-NEXT: .LBB5_5:
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq