[llvm] [SystemZ] Enable MachineCombiner for FP reassociation (PR #83546)
Jonas Paulsson via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 30 07:58:59 PDT 2024
https://github.com/JonPsson1 updated https://github.com/llvm/llvm-project/pull/83546
>From 834642ecc8b2859a9c1180c30d747b2a9f9f429f Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson at linux.vnet.ibm.com>
Date: Tue, 4 Apr 2023 22:09:43 +0200
Subject: [PATCH 1/9] Cleaned up after experiments. Remove RegMem parts
MachineCombiner tests
---
.../llvm/CodeGen/MachineCombinerPattern.h | 9 +
llvm/lib/CodeGen/MachineCombiner.cpp | 7 +
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 442 +++++++++++
llvm/lib/Target/SystemZ/SystemZInstrInfo.h | 33 +
.../Target/SystemZ/SystemZTargetMachine.cpp | 9 +
llvm/lib/Target/X86/X86InstrInfo.cpp | 2 +-
llvm/lib/Target/X86/X86InstrInfo.h | 2 +-
llvm/test/CodeGen/SystemZ/fp-add-02.ll | 14 +
llvm/test/CodeGen/SystemZ/fp-mul-02.ll | 12 +-
.../SystemZ/machine-combiner-reassoc-fp-01.ll | 690 ++++++++++++++++++
.../SystemZ/machine-combiner-reassoc-fp-03.ll | 91 +++
.../SystemZ/machine-combiner-reassoc-fp-04.ll | 123 ++++
.../SystemZ/machine-combiner-reassoc-fp-08.ll | 116 +++
.../SystemZ/machine-combiner-reassoc-fp-09.ll | 177 +++++
14 files changed, 1724 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-01.ll
create mode 100644 llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll
create mode 100644 llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll
create mode 100644 llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll
create mode 100644 llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll
diff --git a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
index 3428c4dde5c7fc..b9d568f3e230ef 100644
--- a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
+++ b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
@@ -34,6 +34,15 @@ enum MachineCombinerPattern : unsigned {
REASSOC_XA_YB,
TARGET_PATTERN_START
+ // SystemZ patterns. (EXPERIMENTAL)
+ FMA2_P1P0,
+ FMA2_P0P1,
+ FMA2,
+ FMA1_Add_L,
+ FMA1_Add_R,
+ FMA3,      // These two patterns are inspired by PPC.
+ FMA2_Add,
+
};
} // end namespace llvm
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index c11263163a34ff..a1ccca790fca9a 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -306,6 +306,13 @@ CombinerObjective MachineCombiner::getCombinerObjective(unsigned Pattern) {
case MachineCombinerPattern::REASSOC_AX_YB:
case MachineCombinerPattern::REASSOC_XA_BY:
case MachineCombinerPattern::REASSOC_XA_YB:
+ case MachineCombinerPattern::FMA2_P1P0:
+ case MachineCombinerPattern::FMA2_P0P1:
+ case MachineCombinerPattern::FMA2:
+ case MachineCombinerPattern::FMA1_Add_L:
+ case MachineCombinerPattern::FMA1_Add_R:
+ case MachineCombinerPattern::FMA3:
+ case MachineCombinerPattern::FMA2_Add:
return CombinerObjective::MustReduceDepth;
default:
return TII->getCombinerObjective(Pattern);
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 6b75c30943b40a..7f850a43a31f0f 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -1004,6 +1005,447 @@ SystemZInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
return nullptr;
}
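+// Reassociation requires both the 'reassoc' and 'nsz' fast-math flags to be
+// set on the instruction.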
+static bool hasReassocFlags(const MachineInstr *MI) {
+ return (MI->getFlag(MachineInstr::MIFlag::FmReassoc) &&
+ MI->getFlag(MachineInstr::MIFlag::FmNsz));
+}
+
+bool SystemZInstrInfo::IsReassociableFMA(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ case SystemZ::VFMADB:
+ case SystemZ::VFMASB:
+ case SystemZ::WFMAXB:
+ case SystemZ::WFMADB:
+ case SystemZ::WFMASB:
+ return hasReassocFlags(MI);
+ default:
+ break;
+ }
+ return false;
+}
+
+bool SystemZInstrInfo::IsReassociableAdd(const MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ case SystemZ::VFADB:
+ case SystemZ::VFASB:
+ case SystemZ::WFAXB:
+ return hasReassocFlags(MI);
+ case SystemZ::WFADB_CCPseudo:
+ case SystemZ::WFASB_CCPseudo:
+ return hasReassocFlags(MI) &&
+ MI->findRegisterDefOperandIdx(SystemZ::CC, true/*isDead*/) != -1;
+ default:
+ break;
+ }
+ return false;
+}
+
+// EXPERIMENTAL: temporary options to choose between the SystemZ-style and the
+// PPC-inspired FMA reassociation patterns while evaluating them.
+static cl::opt<bool> Z_FMA("z-fma", cl::init(false));
+static cl::opt<bool> PPC_FMA("ppc-fma", cl::init(false));
+
+bool SystemZInstrInfo::getFMAPatterns(
+ MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
+ bool DoRegPressureReduce) const {
+ assert(Patterns.empty());
+ MachineBasicBlock *MBB = Root.getParent();
+ const MachineRegisterInfo *MRI = &MBB->getParent()->getRegInfo();
+
+ if (!IsReassociableFMA(&Root))
+ return false;
+
+ const TargetRegisterClass *RC = MRI->getRegClass(Root.getOperand(0).getReg());
+
+ // Check that all explicit operands are virtual registers without subregs
+ // and that the defined register class is compatible with Root's. This is
+ // more or less always true.
+ auto AllOpsOK = [&MRI, &RC](const MachineInstr &Instr) {
+ for (const auto &MO : Instr.explicit_operands())
+ if (!(MO.isReg() && MO.getReg().isVirtual() && !MO.getSubReg()))
+ return false;
+ const TargetRegisterClass *DefRC = MRI->getRegClass(Instr.getOperand(0).getReg());
+ if (!DefRC->hasSubClassEq(RC) && !DefRC->hasSuperClassEq(RC))
+ return false;
+ return true;
+ };
+ if (!AllOpsOK(Root))
+ return false;
+
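+ // Walk up the accumulator operands to collect a chain of reassociable FMAs
+ // in this block, each having a single (non-debug) use. If the chain instead
+ // ends in a reassociable add, remember it as TopAdd.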
+ MachineInstr *TopAdd = nullptr;
+ std::vector<MachineInstr *> FMAChain;
+ FMAChain.push_back(&Root);
+ Register Acc = Root.getOperand(3).getReg();
+ while (MachineInstr *Prev = MRI->getUniqueVRegDef(Acc)) {
+ if (Prev->getParent() != MBB || !MRI->hasOneNonDBGUse(Acc) ||
+ !AllOpsOK(*Prev))
+ break;
+ if (IsReassociableFMA(Prev)) {
+ FMAChain.push_back(Prev);
+ Acc = Prev->getOperand(3).getReg();
+ continue;
+ }
+ if (IsReassociableAdd(Prev))
+ TopAdd = Prev;
+ break;
+ }
+
+ if (Z_FMA) {
+ if (FMAChain.size() >= 2) {
+ Patterns.push_back(MachineCombinerPattern::FMA2_P1P0);
+ LLVM_DEBUG(dbgs() << "add pattern FMA2_P1P0\n");
+ Patterns.push_back(MachineCombinerPattern::FMA2_P0P1);
+ LLVM_DEBUG(dbgs() << "add pattern FMA2_P0P1\n");
+ Patterns.push_back(MachineCombinerPattern::FMA2);
+ LLVM_DEBUG(dbgs() << "add pattern FMA2\n");
+ }
+ if (FMAChain.size() == 1 && TopAdd) {
+ // The latency of the FMA could potentially be hidden above the add:
+ // Try both sides of the add and let MachineCombiner decide on
+ // profitability.
+ Patterns.push_back(MachineCombinerPattern::FMA1_Add_L);
+ LLVM_DEBUG(dbgs() << "add pattern FMA1_Add_L\n");
+ Patterns.push_back(MachineCombinerPattern::FMA1_Add_R);
+ LLVM_DEBUG(dbgs() << "add pattern FMA1_Add_R\n");
+ }
+ } else if (PPC_FMA) {
+ if (FMAChain.size() >= 3) {
+ Patterns.push_back(MachineCombinerPattern::FMA3);
+ LLVM_DEBUG(dbgs() << "add pattern FMA3\n");
+ }
+ if (FMAChain.size() == 2 && TopAdd) {
+ Patterns.push_back(MachineCombinerPattern::FMA2_Add);
+ LLVM_DEBUG(dbgs() << "add pattern FMA2_Add\n");
+ }
+ }
+
+ return !Patterns.empty();
+}
+
+bool SystemZInstrInfo::getMachineCombinerPatterns(
+ MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
+ bool DoRegPressureReduce) const {
+
+ if (getFMAPatterns(Root, Patterns, DoRegPressureReduce))
+ return true;
+
+ return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+ DoRegPressureReduce);
+}
+
+void SystemZInstrInfo::finalizeInsInstrs(
+ MachineInstr &Root, MachineCombinerPattern &P,
+ SmallVectorImpl<MachineInstr *> &InsInstrs) const {
+ const TargetRegisterInfo *TRI =
+ Root.getParent()->getParent()->getSubtarget().getRegisterInfo();
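+ // The CC-clobbering pseudos are only reassociated when CC is not live, so
+ // mark the CC def as dead on the newly built instructions.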
+ for (auto *Inst : InsInstrs) {
+ switch (Inst->getOpcode()) {
+ case SystemZ::WFADB_CCPseudo:
+ case SystemZ::WFASB_CCPseudo:
+ case SystemZ::WFSDB_CCPseudo:
+ case SystemZ::WFSSB_CCPseudo:
+ Inst->addRegisterDead(SystemZ::CC, TRI);
+ break;
+ default: break;
+ }
+ }
+}
+
+bool SystemZInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
+ bool Invert) const {
+ unsigned Opc = Inst.getOpcode();
+ if (Invert) {
+ auto InverseOpcode = getInverseOpcode(Opc);
+ if (!InverseOpcode)
+ return false;
+ Opc = *InverseOpcode;
+ }
+
+ switch (Opc) {
+ default:
+ break;
+ // Adds and multiplications.
+ case SystemZ::VFADB:
+ case SystemZ::VFASB:
+ case SystemZ::WFAXB:
+ case SystemZ::WFADB_CCPseudo:
+ case SystemZ::WFASB_CCPseudo:
+ case SystemZ::VFMDB:
+ case SystemZ::VFMSB:
+ case SystemZ::WFMXB:
+ case SystemZ::WFMDB:
+ case SystemZ::WFMSB:
+ return hasReassocFlags(&Inst);
+ }
+
+ return false;
+}
+
+std::optional<unsigned>
+SystemZInstrInfo::getInverseOpcode(unsigned Opcode) const {
+ // fadd <=> fsub in various forms.
+ switch (Opcode) {
+ case SystemZ::VFADB: return SystemZ::VFSDB;
+ case SystemZ::VFASB: return SystemZ::VFSSB;
+ case SystemZ::WFAXB: return SystemZ::WFSXB;
+ case SystemZ::WFADB_CCPseudo: return SystemZ::WFSDB_CCPseudo;
+ case SystemZ::WFASB_CCPseudo: return SystemZ::WFSSB_CCPseudo;
+ case SystemZ::VFSDB: return SystemZ::VFADB;
+ case SystemZ::VFSSB: return SystemZ::VFASB;
+ case SystemZ::WFSXB: return SystemZ::WFAXB;
+ case SystemZ::WFSDB_CCPseudo: return SystemZ::WFADB_CCPseudo;
+ case SystemZ::WFSSB_CCPseudo: return SystemZ::WFASB_CCPseudo;
+ default: return std::nullopt;
+ }
+}
+
+void SystemZInstrInfo::genAlternativeCodeSequence(
+ MachineInstr &Root, MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+ switch (Pattern) {
+ case MachineCombinerPattern::FMA2_P1P0:
+ case MachineCombinerPattern::FMA2_P0P1:
+ case MachineCombinerPattern::FMA2:
+ case MachineCombinerPattern::FMA1_Add_L:
+ case MachineCombinerPattern::FMA1_Add_R:
+ case MachineCombinerPattern::FMA3:
+ case MachineCombinerPattern::FMA2_Add:
+ reassociateFMA(Root, Pattern, InsInstrs, DelInstrs, InstrIdxForVirtReg);
+ break;
+ default:
+ // Reassociate default patterns.
+ TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
+ DelInstrs, InstrIdxForVirtReg);
+ break;
+ }
+}
+
+static void getSplitFMAOpcodes(unsigned FMAOpc, unsigned &AddOpc,
+ unsigned &MulOpc) {
+ switch (FMAOpc) {
+ case SystemZ::VFMADB: AddOpc = SystemZ::VFADB; MulOpc = SystemZ::VFMDB; break;
+ case SystemZ::VFMASB: AddOpc = SystemZ::VFASB; MulOpc = SystemZ::VFMSB; break;
+ case SystemZ::WFMAXB: AddOpc = SystemZ::WFAXB; MulOpc = SystemZ::WFMXB; break;
+ case SystemZ::WFMADB:
+ AddOpc = SystemZ::WFADB_CCPseudo; MulOpc = SystemZ::WFMDB; break;
+ case SystemZ::WFMASB:
+ AddOpc = SystemZ::WFASB_CCPseudo; MulOpc = SystemZ::WFMSB; break;
+ default:
+ llvm_unreachable("Expected FMA opcode.");
+ }
+}
+
+void SystemZInstrInfo::reassociateFMA(
+ MachineInstr &Root, MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+ MachineFunction *MF = Root.getMF();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ const TargetRegisterClass *RC = Root.getRegClassConstraint(0, this, TRI);
+ Register DstReg = Root.getOperand(0).getReg();
+ std::vector<MachineInstr *> Chain; // Instructions to be combined, Root first.
+ Chain.push_back(&Root);
+
+ uint16_t IntersectedFlags = Root.getFlags();
+ auto getIntersectedFlags = [&]() {
+ for (auto *MI : Chain)
+ IntersectedFlags &= MI->getFlags();
+ };
+
+ auto createNewVReg = [&](unsigned NewInsIdx) -> Register {
+ Register NewReg = MRI.createVirtualRegister(RC);
+ InstrIdxForVirtReg.insert(std::make_pair(NewReg, NewInsIdx));
+ return NewReg;
+ };
+
+ auto finalizeNewMIs = [&](ArrayRef<MachineInstr *> NewMIs) {
+ for (auto *MI : NewMIs) {
+ setSpecialOperandAttr(*MI, IntersectedFlags);
+ MI->addRegisterDead(SystemZ::CC, TRI);
+ InsInstrs.push_back(MI);
+ }
+ };
+
+ auto deleteOld = [&InsInstrs, &DelInstrs, &Chain]() {
+ assert(!InsInstrs.empty() &&
+ "Insertion instructions set should not be empty!");
+ // Record old instructions for deletion.
+ for (auto *MI : make_range(Chain.rbegin(), Chain.rend()))
+ DelInstrs.push_back(MI);
+ };
+
+ assert(IsReassociableFMA(&Root));
+ unsigned FMAOpc = Root.getOpcode();
+ unsigned AddOpc, MulOpc;
+ getSplitFMAOpcodes(FMAOpc, AddOpc, MulOpc);
+
+#ifndef NDEBUG
+ auto IsAllFMA = [&Chain, &FMAOpc]() {
+ for (auto *MI : Chain)
+ if (MI->getOpcode() != FMAOpc)
+ return false;
+ return true;
+ };
+#endif
+
+ switch (Pattern) {
+ case MachineCombinerPattern::FMA2_P1P0:
+ case MachineCombinerPattern::FMA2_P0P1: {
+ if (Pattern == MachineCombinerPattern::FMA2_P1P0)
+ LLVM_DEBUG(dbgs() << "reassociating using pattern FMA2_P1P0\n");
+ else
+ LLVM_DEBUG(dbgs() << "reassociating using pattern FMA2_P0P1\n");
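+ // Rewrite Dst = A1*A2 + (B1*B2 + Acc) as Dst = Acc + (X1*X2 + (Y1*Y2)) so
+ // that both products become independent of the incoming accumulator. The
+ // two patterns differ in which product is emitted as the plain multiply.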
+ Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+ assert(IsAllFMA());
+ getIntersectedFlags();
+ Register NewVRA = createNewVReg(0);
+ Register NewVRB = createNewVReg(1);
+ unsigned FirstMulIdx =
+ Pattern == MachineCombinerPattern::FMA2_P1P0 ? 1 : 0;
+ unsigned SecondMulIdx = FirstMulIdx == 0 ? 1 : 0;
+ MachineInstr *MINewA =
+ BuildMI(*MF, Chain[FirstMulIdx]->getDebugLoc(), get(MulOpc), NewVRA)
+ .add(Chain[FirstMulIdx]->getOperand(1))
+ .add(Chain[FirstMulIdx]->getOperand(2));
+ MachineInstr *MINewB =
+ BuildMI(*MF, Chain[SecondMulIdx]->getDebugLoc(), get(FMAOpc), NewVRB)
+ .add(Chain[SecondMulIdx]->getOperand(1))
+ .add(Chain[SecondMulIdx]->getOperand(2))
+ .addReg(NewVRA);
+ MachineInstr *MINewC =
+ BuildMI(*MF, Chain[1]->getDebugLoc(), get(AddOpc), DstReg)
+ .add(Chain[1]->getOperand(3))
+ .addReg(NewVRB);
+ finalizeNewMIs({MINewA, MINewB, MINewC});
+ break;
+ }
+ case MachineCombinerPattern::FMA2: {
+ LLVM_DEBUG(dbgs() << "reassociating using pattern FMA2\n");
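+ // Swap the order of the two chained FMAs:
+ // Dst = A1*A2 + (B1*B2 + Acc) becomes Dst = B1*B2 + (A1*A2 + Acc).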
+ Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+ assert(IsAllFMA());
+ getIntersectedFlags();
+ Register NewVRA = createNewVReg(0);
+ MachineInstr *MINewA =
+ BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRA)
+ .add(Chain[0]->getOperand(1))
+ .add(Chain[0]->getOperand(2))
+ .add(Chain[1]->getOperand(3));
+ MachineInstr *MINewB =
+ BuildMI(*MF, Chain[1]->getDebugLoc(), get(FMAOpc), DstReg)
+ .add(Chain[1]->getOperand(1))
+ .add(Chain[1]->getOperand(2))
+ .addReg(NewVRA);
+ finalizeNewMIs({MINewA, MINewB});
+ break;
+ }
+ case MachineCombinerPattern::FMA1_Add_L:
+ case MachineCombinerPattern::FMA1_Add_R: {
+ if (Pattern == MachineCombinerPattern::FMA1_Add_L)
+ LLVM_DEBUG(dbgs() << "reassociating using pattern FMA1_Add_L\n");
+ else
+ LLVM_DEBUG(dbgs() << "reassociating using pattern FMA1_Add_R\n");
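+ // Fold one operand of the add into the FMA as its accumulator and add the
+ // other operand afterwards:
+ // Dst = A1*A2 + (L + R) becomes Dst = (A1*A2 + L) + R (or with R folded in).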
+ assert(IsAllFMA());
+ Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+ assert(Chain.back()->getOpcode() == AddOpc && "Expected matching Add");
+ getIntersectedFlags();
+ unsigned Op = Pattern == MachineCombinerPattern::FMA1_Add_L ? 1 : 2;
+ unsigned OtherOp = Op == 1 ? 2 : 1;
+ Register NewVRA = createNewVReg(0);
+ MachineInstr *MINewA =
+ BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRA)
+ .add(Chain[0]->getOperand(1))
+ .add(Chain[0]->getOperand(2))
+ .add(Chain[1]->getOperand(Op));
+ MachineInstr *MINewB =
+ BuildMI(*MF, Chain[1]->getDebugLoc(), get(AddOpc), DstReg)
+ .addReg(NewVRA)
+ .add(Chain[1]->getOperand(OtherOp));
+ finalizeNewMIs({MINewA, MINewB});
+ break;
+ }
+ case MachineCombinerPattern::FMA3: {
+ LLVM_DEBUG(dbgs() << "reassociating using pattern FMA3\n");
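+ // Split the serial chain of three FMAs into two parallel chains joined by
+ // an add: Dst = A1*A2 + (B1*B2 + (C1*C2 + Acc)) becomes
+ // Dst = (B1*B2 + Acc) + (A1*A2 + C1*C2).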
+ Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+ Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+ assert(IsAllFMA());
+ getIntersectedFlags();
+ Register NewVRA = createNewVReg(0);
+ Register NewVRB = createNewVReg(1);
+ Register NewVRC = createNewVReg(2);
+ MachineInstr *MINewA =
+ BuildMI(*MF, Chain[2]->getDebugLoc(), get(MulOpc), NewVRA)
+ .add(Chain[2]->getOperand(1))
+ .add(Chain[2]->getOperand(2));
+ MachineInstr *MINewB =
+ BuildMI(*MF, Chain[1]->getDebugLoc(), get(FMAOpc), NewVRB)
+ .add(Chain[1]->getOperand(1))
+ .add(Chain[1]->getOperand(2))
+ .add(Chain[2]->getOperand(3));
+ MachineInstr *MINewC =
+ BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRC)
+ .add(Chain[0]->getOperand(1))
+ .add(Chain[0]->getOperand(2))
+ .addReg(NewVRA);
+ MachineInstr *MINewD =
+ BuildMI(*MF, Chain[0]->getDebugLoc(), get(AddOpc), DstReg)
+ .addReg(NewVRB)
+ .addReg(NewVRC);
+ finalizeNewMIs({MINewA, MINewB, MINewC, MINewD});
+ break;
+ }
+ case MachineCombinerPattern::FMA2_Add: {
+ LLVM_DEBUG(dbgs() << "reassociating using pattern FMA2_Add\n");
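+ // Distribute the two FMAs over the operands of the incoming add:
+ // Dst = A1*A2 + (B1*B2 + (L + R)) becomes Dst = (B1*B2 + L) + (A1*A2 + R).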
+ Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+ assert(IsAllFMA());
+ Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+ assert(Chain.back()->getOpcode() == AddOpc && "Expected matching Add");
+ getIntersectedFlags();
+ Register NewVRA = createNewVReg(0);
+ Register NewVRB = createNewVReg(1);
+ MachineInstr *MINewA =
+ BuildMI(*MF, Chain[1]->getDebugLoc(), get(FMAOpc), NewVRA)
+ .add(Chain[1]->getOperand(1))
+ .add(Chain[1]->getOperand(2))
+ .add(Chain[2]->getOperand(1));
+ MachineInstr *MINewB =
+ BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRB)
+ .add(Chain[0]->getOperand(1))
+ .add(Chain[0]->getOperand(2))
+ .add(Chain[2]->getOperand(2));
+ MachineInstr *MINewC =
+ BuildMI(*MF, Chain[0]->getDebugLoc(), get(AddOpc), DstReg)
+ .addReg(NewVRA)
+ .addReg(NewVRB);
+ finalizeNewMIs({MINewA, MINewB, MINewC});
+ break;
+ }
+ default:
+ llvm_unreachable("not recognized pattern!");
+ }
+
+ deleteOld();
+}
+
+bool
+SystemZInstrInfo::accumulateInstrSeqToRootLatency(MachineInstr &Root) const {
+ // This doesn't make much sense for FMA patterns as they typically use an
+ // extra Add to do things in parallel.
+ if (IsReassociableFMA(&Root))
+ return false;
+
+ return true;
+}
+
+void SystemZInstrInfo::setSpecialOperandAttr(MachineInstr &MI,
+ uint32_t Flags) const {
+ MI.setFlags(Flags);
+ MI.clearFlag(MachineInstr::MIFlag::NoSWrap);
+ MI.clearFlag(MachineInstr::MIFlag::NoUWrap);
+ MI.clearFlag(MachineInstr::MIFlag::IsExact);
+}
+
MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex,
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index cdf07310108a96..09ffcb6690502e 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -256,6 +256,7 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
Register FalseReg) const override;
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
MachineRegisterInfo *MRI) const override;
+
bool isPredicable(const MachineInstr &MI) const override;
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,
@@ -285,6 +286,38 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
Register VReg) const override;
MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
LiveIntervals *LIS) const override;
+
+ bool useMachineCombiner() const override { return true; }
+ bool IsReassociableFMA(const MachineInstr *MI) const;
+ bool IsReassociableAdd(const MachineInstr *MI) const;
+ bool getFMAPatterns(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &P,
+ bool DoRegPressureReduce) const;
+ bool getMachineCombinerPatterns(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &P,
+ bool DoRegPressureReduce) const override;
+ void
+ finalizeInsInstrs(MachineInstr &Root, MachineCombinerPattern &P,
+ SmallVectorImpl<MachineInstr *> &InsInstrs) const override;
+ bool isAssociativeAndCommutative(const MachineInstr &Inst,
+ bool Invert) const override;
+ std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;
+ void genAlternativeCodeSequence(
+ MachineInstr &Root, MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const override;
+ void reassociateFMA(
+ MachineInstr &Root, MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;
+ bool accumulateInstrSeqToRootLatency(MachineInstr &Root) const override;
+ // Do not allow the reassociated sequence to become longer than the
+ // original one.
+ int getExtendResourceLenLimit() const override { return 0; }
+ // SystemZ-specific version of setSpecialOperandAttr that copies Flags to
+ // MI and clears the nuw, nsw, and exact flags.
+ void setSpecialOperandAttr(MachineInstr &MI, uint32_t Flags) const;
+
MachineInstr *
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 2491bd2ee2c12c..2436df0c8fb0a2 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -30,6 +30,11 @@
using namespace llvm;
+static cl::opt<bool>
+EnableMachineCombinerPass("systemz-machine-combiner",
+ cl::desc("Enable the machine combiner pass"),
+ cl::init(true), cl::Hidden);
+
// NOLINTNEXTLINE(readability-identifier-naming)
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
// Register the target.
@@ -245,6 +250,10 @@ bool SystemZPassConfig::addInstSelector() {
bool SystemZPassConfig::addILPOpts() {
addPass(&EarlyIfConverterID);
+
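+ // Run the MachineCombiner to reassociate floating-point operations when
+ // allowed by the fast-math flags.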
+ if (EnableMachineCombinerPass)
+ addPass(&MachineCombinerID);
+
return true;
}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 3d80c43b571f9c..07b79572cfd698 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5499,7 +5499,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
/// register, the virtual register is used once in the same BB, and the
/// instructions in-between do not load or store, and have no side effects.
MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
- const MachineRegisterInfo *MRI,
+ MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
// Check whether we can move DefMI here.
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 5407ede69a91ca..a6cae4622d922f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -559,7 +559,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
const MachineRegisterInfo *MRI) const override;
MachineInstr *optimizeLoadInstr(MachineInstr &MI,
- const MachineRegisterInfo *MRI,
+ MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const override;
diff --git a/llvm/test/CodeGen/SystemZ/fp-add-02.ll b/llvm/test/CodeGen/SystemZ/fp-add-02.ll
index bb12196fb848a5..8f65161b5bae83 100644
--- a/llvm/test/CodeGen/SystemZ/fp-add-02.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-add-02.ll
@@ -118,3 +118,17 @@ define double @f7(ptr %ptr0) {
ret double %add10
}
+
+; Check that reassociation flags do not get in the way of adb.
+define double @f8(ptr %x) {
+; CHECK-LABEL: f8:
+; CHECK: ld %f0
+; CHECK: adb %f0
+; CHECK: br %r14
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %1 = load double, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn double %1, %0
+ ret double %add
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-02.ll b/llvm/test/CodeGen/SystemZ/fp-mul-02.ll
index 5a99537493cd19..1ac4bbec352d1c 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-02.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-02.ll
@@ -1,6 +1,6 @@
; Test multiplication of two f32s, producing an f64 result.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s
declare float @foo()
@@ -201,3 +201,13 @@ define float @f7(ptr %ptr0) {
ret float %trunc9
}
+
+; Check that reassociation flags do not get in the way of mdebr.
+define double @f8(float %Src) {
+; CHECK-LABEL: f8:
+; CHECK: mdebr %f0, %f0
+; CHECK: br %r14
+ %D = fpext float %Src to double
+ %res = fmul reassoc nsz arcp contract afn double %D, %D
+ ret double %res
+}
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-01.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-01.ll
new file mode 100644
index 00000000000000..72303a47dc7386
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-01.ll
@@ -0,0 +1,690 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
+; RUN: | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -stop-before=processimpdefs \
+; RUN: -O3 | FileCheck %s --check-prefix=PASSOUTPUT
+
+; Test reassociation of fp add, subtract and multiply.
+
+define double @fun0_fadd(ptr %x) {
+; CHECK-LABEL: fun0_fadd:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ld %f0, 0(%r2)
+; CHECK-NEXT: adb %f0, 8(%r2)
+; CHECK-NEXT: ld %f1, 24(%r2)
+; CHECK-NEXT: adb %f1, 16(%r2)
+; CHECK-NEXT: adbr %f0, %f1
+; CHECK-NEXT: ld %f1, 40(%r2)
+; CHECK-NEXT: adb %f1, 32(%r2)
+; CHECK-NEXT: adb %f1, 48(%r2)
+; CHECK-NEXT: adbr %f0, %f1
+; CHECK-NEXT: adb %f0, 56(%r2)
+; CHECK-NEXT: br %r14
+
+; PASSOUTPUT: name: fun0_fadd
+; PASSOUTPUT-NOT: WFADB
+; PASSOUTPUT: WFADB killed %3, killed %18, implicit $fpc
+; PASSOUTPUT-NOT: WFADB {{.*}}$cc
+; PASSOUTPUT-NOT: WFADB_CCPseudo
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %1 = load double, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn double %1, %0
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %2 = load double, ptr %arrayidx2, align 8
+ %add3 = fadd reassoc nsz arcp contract afn double %add, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+ %3 = load double, ptr %arrayidx4, align 8
+ %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
+ %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+ %4 = load double, ptr %arrayidx6, align 8
+ %add7 = fadd reassoc nsz arcp contract afn double %add5, %4
+ %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+ %5 = load double, ptr %arrayidx8, align 8
+ %add9 = fadd reassoc nsz arcp contract afn double %add7, %5
+ %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+ %6 = load double, ptr %arrayidx10, align 8
+ %add11 = fadd reassoc nsz arcp contract afn double %add9, %6
+ %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+ %7 = load double, ptr %arrayidx12, align 8
+ %add13 = fadd reassoc nsz arcp contract afn double %add11, %7
+ ret double %add13
+}
+
+define float @fun1_fadd(ptr %x) {
+; CHECK-LABEL: fun1_fadd:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lde %f0, 0(%r2)
+; CHECK-NEXT: aeb %f0, 4(%r2)
+; CHECK-NEXT: lde %f1, 12(%r2)
+; CHECK-NEXT: aeb %f1, 8(%r2)
+; CHECK-NEXT: aebr %f0, %f1
+; CHECK-NEXT: lde %f1, 20(%r2)
+; CHECK-NEXT: aeb %f1, 16(%r2)
+; CHECK-NEXT: aeb %f1, 24(%r2)
+; CHECK-NEXT: aebr %f0, %f1
+; CHECK-NEXT: aeb %f0, 28(%r2)
+; CHECK-NEXT: br %r14
+
+; PASSOUTPUT: name: fun1_fadd
+; PASSOUTPUT-NOT: WFASB
+; PASSOUTPUT: WFASB killed %3, killed %18, implicit $fpc
+; PASSOUTPUT-NOT: WFASB {{.*}}$cc
+; PASSOUTPUT-NOT: WFASB_CCPseudo
+entry:
+ %0 = load float, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+ %1 = load float, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn float %1, %0
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2
+ %2 = load float, ptr %arrayidx2, align 8
+ %add3 = fadd reassoc nsz arcp contract afn float %add, %2
+ %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3
+ %3 = load float, ptr %arrayidx4, align 8
+ %add5 = fadd reassoc nsz arcp contract afn float %add3, %3
+ %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4
+ %4 = load float, ptr %arrayidx6, align 8
+ %add7 = fadd reassoc nsz arcp contract afn float %add5, %4
+ %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5
+ %5 = load float, ptr %arrayidx8, align 8
+ %add9 = fadd reassoc nsz arcp contract afn float %add7, %5
+ %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6
+ %6 = load float, ptr %arrayidx10, align 8
+ %add11 = fadd reassoc nsz arcp contract afn float %add9, %6
+ %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7
+ %7 = load float, ptr %arrayidx12, align 8
+ %add13 = fadd reassoc nsz arcp contract afn float %add11, %7
+ ret float %add13
+}
+
+define fp128 @fun2_fadd(ptr %x) {
+; CHECK-LABEL: fun2_fadd:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 16(%r3), 3
+; CHECK-NEXT: wfaxb %v0, %v1, %v0
+; CHECK-NEXT: vl %v1, 32(%r3), 3
+; CHECK-NEXT: vl %v2, 48(%r3), 3
+; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: wfaxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r3), 3
+; CHECK-NEXT: vl %v2, 80(%r3), 3
+; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r3), 3
+; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: wfaxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r3), 3
+; CHECK-NEXT: wfaxb %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+entry:
+ %0 = load fp128, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1
+ %1 = load fp128, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn fp128 %1, %0
+ %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2
+ %2 = load fp128, ptr %arrayidx2, align 8
+ %add3 = fadd reassoc nsz arcp contract afn fp128 %add, %2
+ %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3
+ %3 = load fp128, ptr %arrayidx4, align 8
+ %add5 = fadd reassoc nsz arcp contract afn fp128 %add3, %3
+ %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4
+ %4 = load fp128, ptr %arrayidx6, align 8
+ %add7 = fadd reassoc nsz arcp contract afn fp128 %add5, %4
+ %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5
+ %5 = load fp128, ptr %arrayidx8, align 8
+ %add9 = fadd reassoc nsz arcp contract afn fp128 %add7, %5
+ %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6
+ %6 = load fp128, ptr %arrayidx10, align 8
+ %add11 = fadd reassoc nsz arcp contract afn fp128 %add9, %6
+ %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7
+ %7 = load fp128, ptr %arrayidx12, align 8
+ %add13 = fadd reassoc nsz arcp contract afn fp128 %add11, %7
+ ret fp128 %add13
+}
+
+define <2 x double> @fun3_fadd(ptr %x) {
+; CHECK-LABEL: fun3_fadd:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vfadb %v0, %v1, %v0
+; CHECK-NEXT: vl %v1, 32(%r2), 3
+; CHECK-NEXT: vl %v2, 48(%r2), 3
+; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vfadb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r2), 3
+; CHECK-NEXT: vl %v2, 80(%r2), 3
+; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r2), 3
+; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vfadb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r2), 3
+; CHECK-NEXT: vfadb %v24, %v0, %v1
+; CHECK-NEXT: br %r14
+entry:
+ %0 = load <2 x double>, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1
+ %1 = load <2 x double>, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn <2 x double> %1, %0
+ %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2
+ %2 = load <2 x double>, ptr %arrayidx2, align 8
+ %add3 = fadd reassoc nsz arcp contract afn <2 x double> %add, %2
+ %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 3
+ %3 = load <2 x double>, ptr %arrayidx4, align 8
+ %add5 = fadd reassoc nsz arcp contract afn <2 x double> %add3, %3
+ %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4
+ %4 = load <2 x double>, ptr %arrayidx6, align 8
+ %add7 = fadd reassoc nsz arcp contract afn <2 x double> %add5, %4
+ %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5
+ %5 = load <2 x double>, ptr %arrayidx8, align 8
+ %add9 = fadd reassoc nsz arcp contract afn <2 x double> %add7, %5
+ %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6
+ %6 = load <2 x double>, ptr %arrayidx10, align 8
+ %add11 = fadd reassoc nsz arcp contract afn <2 x double> %add9, %6
+ %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7
+ %7 = load <2 x double>, ptr %arrayidx12, align 8
+ %add13 = fadd reassoc nsz arcp contract afn <2 x double> %add11, %7
+ ret <2 x double> %add13
+}
+
+define <4 x float> @fun4_fadd(ptr %x) {
+; CHECK-LABEL: fun4_fadd:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vfasb %v0, %v1, %v0
+; CHECK-NEXT: vl %v1, 32(%r2), 3
+; CHECK-NEXT: vl %v2, 48(%r2), 3
+; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vfasb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r2), 3
+; CHECK-NEXT: vl %v2, 80(%r2), 3
+; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r2), 3
+; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vfasb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r2), 3
+; CHECK-NEXT: vfasb %v24, %v0, %v1
+; CHECK-NEXT: br %r14
+entry:
+ %0 = load <4 x float>, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1
+ %1 = load <4 x float>, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn <4 x float> %1, %0
+ %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2
+ %2 = load <4 x float>, ptr %arrayidx2, align 8
+ %add3 = fadd reassoc nsz arcp contract afn <4 x float> %add, %2
+ %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3
+ %3 = load <4 x float>, ptr %arrayidx4, align 8
+ %add5 = fadd reassoc nsz arcp contract afn <4 x float> %add3, %3
+ %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4
+ %4 = load <4 x float>, ptr %arrayidx6, align 8
+ %add7 = fadd reassoc nsz arcp contract afn <4 x float> %add5, %4
+ %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5
+ %5 = load <4 x float>, ptr %arrayidx8, align 8
+ %add9 = fadd reassoc nsz arcp contract afn <4 x float> %add7, %5
+ %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6
+ %6 = load <4 x float>, ptr %arrayidx10, align 8
+ %add11 = fadd reassoc nsz arcp contract afn <4 x float> %add9, %6
+ %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7
+ %7 = load <4 x float>, ptr %arrayidx12, align 8
+ %add13 = fadd reassoc nsz arcp contract afn <4 x float> %add11, %7
+ ret <4 x float> %add13
+}
+
+define double @fun5_fsub(ptr %x) {
+; CHECK-LABEL: fun5_fsub:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ld %f0, 0(%r2)
+; CHECK-NEXT: sdb %f0, 8(%r2)
+; CHECK-NEXT: ld %f1, 24(%r2)
+; CHECK-NEXT: adb %f1, 16(%r2)
+; CHECK-NEXT: sdbr %f0, %f1
+; CHECK-NEXT: ld %f1, 40(%r2)
+; CHECK-NEXT: adb %f1, 32(%r2)
+; CHECK-NEXT: adb %f1, 48(%r2)
+; CHECK-NEXT: sdbr %f0, %f1
+; CHECK-NEXT: sdb %f0, 56(%r2)
+; CHECK-NEXT: br %r14
+
+; PASSOUTPUT: name: fun5_fsub
+; PASSOUTPUT-NOT: WFSDB
+; PASSOUTPUT: WFSDB killed %3, killed %18, implicit $fpc
+; PASSOUTPUT-NOT: WFSDB {{.*}}$cc
+; PASSOUTPUT-NOT: WFSDB_CCPseudo
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %1 = load double, ptr %arrayidx1, align 8
+ %sub = fsub reassoc nsz arcp contract afn double %0, %1
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %2 = load double, ptr %arrayidx2, align 8
+ %sub3 = fsub reassoc nsz arcp contract afn double %sub, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+ %3 = load double, ptr %arrayidx4, align 8
+ %sub5 = fsub reassoc nsz arcp contract afn double %sub3, %3
+ %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+ %4 = load double, ptr %arrayidx6, align 8
+ %sub7 = fsub reassoc nsz arcp contract afn double %sub5, %4
+ %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+ %5 = load double, ptr %arrayidx8, align 8
+ %sub9 = fsub reassoc nsz arcp contract afn double %sub7, %5
+ %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+ %6 = load double, ptr %arrayidx10, align 8
+ %sub11 = fsub reassoc nsz arcp contract afn double %sub9, %6
+ %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+ %7 = load double, ptr %arrayidx12, align 8
+ %sub13 = fsub reassoc nsz arcp contract afn double %sub11, %7
+ ret double %sub13
+}
+
+define float @fun6_fsub(ptr %x) {
+; CHECK-LABEL: fun6_fsub:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lde %f0, 0(%r2)
+; CHECK-NEXT: seb %f0, 4(%r2)
+; CHECK-NEXT: lde %f1, 12(%r2)
+; CHECK-NEXT: aeb %f1, 8(%r2)
+; CHECK-NEXT: sebr %f0, %f1
+; CHECK-NEXT: lde %f1, 20(%r2)
+; CHECK-NEXT: aeb %f1, 16(%r2)
+; CHECK-NEXT: aeb %f1, 24(%r2)
+; CHECK-NEXT: sebr %f0, %f1
+; CHECK-NEXT: seb %f0, 28(%r2)
+; CHECK-NEXT: br %r14
+
+; PASSOUTPUT: name: fun6_fsub
+; PASSOUTPUT-NOT: WFSSB
+; PASSOUTPUT: WFSSB killed %3, killed %18, implicit $fpc
+; PASSOUTPUT-NOT: WFSSB {{.*}}$cc
+; PASSOUTPUT-NOT: WFSSB_CCPseudo
+entry:
+ %0 = load float, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+ %1 = load float, ptr %arrayidx1, align 8
+ %sub = fsub reassoc nsz arcp contract afn float %0, %1
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2
+ %2 = load float, ptr %arrayidx2, align 8
+ %sub3 = fsub reassoc nsz arcp contract afn float %sub, %2
+ %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3
+ %3 = load float, ptr %arrayidx4, align 8
+ %sub5 = fsub reassoc nsz arcp contract afn float %sub3, %3
+ %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4
+ %4 = load float, ptr %arrayidx6, align 8
+ %sub7 = fsub reassoc nsz arcp contract afn float %sub5, %4
+ %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5
+ %5 = load float, ptr %arrayidx8, align 8
+ %sub9 = fsub reassoc nsz arcp contract afn float %sub7, %5
+ %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6
+ %6 = load float, ptr %arrayidx10, align 8
+ %sub11 = fsub reassoc nsz arcp contract afn float %sub9, %6
+ %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7
+ %7 = load float, ptr %arrayidx12, align 8
+ %sub13 = fsub reassoc nsz arcp contract afn float %sub11, %7
+ ret float %sub13
+}
+
+define fp128 @fun7_fsub(ptr %x) {
+; CHECK-LABEL: fun7_fsub:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 16(%r3), 3
+; CHECK-NEXT: wfsxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 32(%r3), 3
+; CHECK-NEXT: vl %v2, 48(%r3), 3
+; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: wfsxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r3), 3
+; CHECK-NEXT: vl %v2, 80(%r3), 3
+; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r3), 3
+; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: wfsxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r3), 3
+; CHECK-NEXT: wfsxb %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+entry:
+ %0 = load fp128, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1
+ %1 = load fp128, ptr %arrayidx1, align 8
+ %sub = fsub reassoc nsz arcp contract afn fp128 %0, %1
+ %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2
+ %2 = load fp128, ptr %arrayidx2, align 8
+ %sub3 = fsub reassoc nsz arcp contract afn fp128 %sub, %2
+ %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3
+ %3 = load fp128, ptr %arrayidx4, align 8
+ %sub5 = fsub reassoc nsz arcp contract afn fp128 %sub3, %3
+ %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4
+ %4 = load fp128, ptr %arrayidx6, align 8
+ %sub7 = fsub reassoc nsz arcp contract afn fp128 %sub5, %4
+ %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5
+ %5 = load fp128, ptr %arrayidx8, align 8
+ %sub9 = fsub reassoc nsz arcp contract afn fp128 %sub7, %5
+ %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6
+ %6 = load fp128, ptr %arrayidx10, align 8
+ %sub11 = fsub reassoc nsz arcp contract afn fp128 %sub9, %6
+ %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7
+ %7 = load fp128, ptr %arrayidx12, align 8
+ %sub13 = fsub reassoc nsz arcp contract afn fp128 %sub11, %7
+ ret fp128 %sub13
+}
+
+define <2 x double> @fun8_fsub(ptr %x) {
+; CHECK-LABEL: fun8_fsub:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vfsdb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 32(%r2), 3
+; CHECK-NEXT: vl %v2, 48(%r2), 3
+; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vfsdb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r2), 3
+; CHECK-NEXT: vl %v2, 80(%r2), 3
+; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r2), 3
+; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vfsdb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r2), 3
+; CHECK-NEXT: vfsdb %v24, %v0, %v1
+; CHECK-NEXT: br %r14
+entry:
+ %0 = load <2 x double>, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1
+ %1 = load <2 x double>, ptr %arrayidx1, align 8
+ %sub = fsub reassoc nsz arcp contract afn <2 x double> %0, %1
+ %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2
+ %2 = load <2 x double>, ptr %arrayidx2, align 8
+ %sub3 = fsub reassoc nsz arcp contract afn <2 x double> %sub, %2
+ %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 3
+ %3 = load <2 x double>, ptr %arrayidx4, align 8
+ %sub5 = fsub reassoc nsz arcp contract afn <2 x double> %sub3, %3
+ %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4
+ %4 = load <2 x double>, ptr %arrayidx6, align 8
+ %sub7 = fsub reassoc nsz arcp contract afn <2 x double> %sub5, %4
+ %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5
+ %5 = load <2 x double>, ptr %arrayidx8, align 8
+ %sub9 = fsub reassoc nsz arcp contract afn <2 x double> %sub7, %5
+ %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6
+ %6 = load <2 x double>, ptr %arrayidx10, align 8
+ %sub11 = fsub reassoc nsz arcp contract afn <2 x double> %sub9, %6
+ %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7
+ %7 = load <2 x double>, ptr %arrayidx12, align 8
+ %sub13 = fsub reassoc nsz arcp contract afn <2 x double> %sub11, %7
+ ret <2 x double> %sub13
+}
+
+define <4 x float> @fun9_fsub(ptr %x) {
+; CHECK-LABEL: fun9_fsub:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vfssb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 32(%r2), 3
+; CHECK-NEXT: vl %v2, 48(%r2), 3
+; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vfssb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r2), 3
+; CHECK-NEXT: vl %v2, 80(%r2), 3
+; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r2), 3
+; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vfssb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r2), 3
+; CHECK-NEXT: vfssb %v24, %v0, %v1
+; CHECK-NEXT: br %r14
+entry:
+ %0 = load <4 x float>, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1
+ %1 = load <4 x float>, ptr %arrayidx1, align 8
+ %sub = fsub reassoc nsz arcp contract afn <4 x float> %0, %1
+ %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2
+ %2 = load <4 x float>, ptr %arrayidx2, align 8
+ %sub3 = fsub reassoc nsz arcp contract afn <4 x float> %sub, %2
+ %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3
+ %3 = load <4 x float>, ptr %arrayidx4, align 8
+ %sub5 = fsub reassoc nsz arcp contract afn <4 x float> %sub3, %3
+ %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4
+ %4 = load <4 x float>, ptr %arrayidx6, align 8
+ %sub7 = fsub reassoc nsz arcp contract afn <4 x float> %sub5, %4
+ %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5
+ %5 = load <4 x float>, ptr %arrayidx8, align 8
+ %sub9 = fsub reassoc nsz arcp contract afn <4 x float> %sub7, %5
+ %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6
+ %6 = load <4 x float>, ptr %arrayidx10, align 8
+ %sub11 = fsub reassoc nsz arcp contract afn <4 x float> %sub9, %6
+ %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7
+ %7 = load <4 x float>, ptr %arrayidx12, align 8
+ %sub13 = fsub reassoc nsz arcp contract afn <4 x float> %sub11, %7
+ ret <4 x float> %sub13
+}
+
+define double @fun10_fmul(ptr %x) {
+; CHECK-LABEL: fun10_fmul:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ld %f0, 8(%r2)
+; CHECK-NEXT: mdb %f0, 0(%r2)
+; CHECK-NEXT: ld %f1, 24(%r2)
+; CHECK-NEXT: mdb %f1, 16(%r2)
+; CHECK-NEXT: mdbr %f0, %f1
+; CHECK-NEXT: ld %f1, 40(%r2)
+; CHECK-NEXT: mdb %f1, 32(%r2)
+; CHECK-NEXT: mdb %f1, 48(%r2)
+; CHECK-NEXT: mdbr %f0, %f1
+; CHECK-NEXT: mdb %f0, 56(%r2)
+; CHECK-NEXT: br %r14
+
+; PASSOUTPUT: name: fun10_fmul
+; PASSOUTPUT-NOT: WFMDB
+; PASSOUTPUT: WFMDB killed %3, killed %18, implicit $fpc
+; PASSOUTPUT-NOT: WFMDB {{.*}}$cc
+; PASSOUTPUT-NOT: WFMDB_CCPseudo
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %1 = load double, ptr %arrayidx1, align 8
+ %mul = fmul reassoc nsz arcp contract afn double %0, %1
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %2 = load double, ptr %arrayidx2, align 8
+ %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+ %3 = load double, ptr %arrayidx4, align 8
+ %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
+ %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+ %4 = load double, ptr %arrayidx6, align 8
+ %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4
+ %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+ %5 = load double, ptr %arrayidx8, align 8
+ %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5
+ %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+ %6 = load double, ptr %arrayidx10, align 8
+ %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6
+ %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+ %7 = load double, ptr %arrayidx12, align 8
+ %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7
+ ret double %mul13
+}
+
+define float @fun11_fmul(ptr %x) {
+; CHECK-LABEL: fun11_fmul:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lde %f0, 4(%r2)
+; CHECK-NEXT: meeb %f0, 0(%r2)
+; CHECK-NEXT: lde %f1, 12(%r2)
+; CHECK-NEXT: meeb %f1, 8(%r2)
+; CHECK-NEXT: meebr %f0, %f1
+; CHECK-NEXT: lde %f1, 20(%r2)
+; CHECK-NEXT: meeb %f1, 16(%r2)
+; CHECK-NEXT: meeb %f1, 24(%r2)
+; CHECK-NEXT: meebr %f0, %f1
+; CHECK-NEXT: meeb %f0, 28(%r2)
+; CHECK-NEXT: br %r14
+
+; PASSOUTPUT: name: fun11_fmul
+; PASSOUTPUT-NOT: WFMSB
+; PASSOUTPUT: WFMSB killed %3, killed %18, implicit $fpc
+; PASSOUTPUT-NOT: WFMSB {{.*}}$cc
+; PASSOUTPUT-NOT: WFMSB_CCPseudo
+entry:
+ %0 = load float, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+ %1 = load float, ptr %arrayidx1, align 8
+ %mul = fmul reassoc nsz arcp contract afn float %0, %1
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2
+ %2 = load float, ptr %arrayidx2, align 8
+ %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2
+ %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3
+ %3 = load float, ptr %arrayidx4, align 8
+ %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3
+ %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4
+ %4 = load float, ptr %arrayidx6, align 8
+ %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4
+ %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5
+ %5 = load float, ptr %arrayidx8, align 8
+ %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5
+ %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6
+ %6 = load float, ptr %arrayidx10, align 8
+ %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6
+ %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7
+ %7 = load float, ptr %arrayidx12, align 8
+ %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7
+ ret float %mul13
+}
+
+define fp128 @fun12_fmul(ptr %x) {
+; CHECK-LABEL: fun12_fmul:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 16(%r3), 3
+; CHECK-NEXT: wfmxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 32(%r3), 3
+; CHECK-NEXT: vl %v2, 48(%r3), 3
+; CHECK-NEXT: wfmxb %v1, %v1, %v2
+; CHECK-NEXT: wfmxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r3), 3
+; CHECK-NEXT: vl %v2, 80(%r3), 3
+; CHECK-NEXT: wfmxb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r3), 3
+; CHECK-NEXT: wfmxb %v1, %v1, %v2
+; CHECK-NEXT: wfmxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r3), 3
+; CHECK-NEXT: wfmxb %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+entry:
+ %0 = load fp128, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1
+ %1 = load fp128, ptr %arrayidx1, align 8
+ %mul = fmul reassoc nsz arcp contract afn fp128 %0, %1
+ %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2
+ %2 = load fp128, ptr %arrayidx2, align 8
+ %mul3 = fmul reassoc nsz arcp contract afn fp128 %mul, %2
+ %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3
+ %3 = load fp128, ptr %arrayidx4, align 8
+ %mul5 = fmul reassoc nsz arcp contract afn fp128 %mul3, %3
+ %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4
+ %4 = load fp128, ptr %arrayidx6, align 8
+ %mul7 = fmul reassoc nsz arcp contract afn fp128 %mul5, %4
+ %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5
+ %5 = load fp128, ptr %arrayidx8, align 8
+ %mul9 = fmul reassoc nsz arcp contract afn fp128 %mul7, %5
+ %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6
+ %6 = load fp128, ptr %arrayidx10, align 8
+ %mul11 = fmul reassoc nsz arcp contract afn fp128 %mul9, %6
+ %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7
+ %7 = load fp128, ptr %arrayidx12, align 8
+ %mul13 = fmul reassoc nsz arcp contract afn fp128 %mul11, %7
+ ret fp128 %mul13
+}
+
+define <2 x double> @fun13_fmul(ptr %x) {
+; CHECK-LABEL: fun13_fmul:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vfmdb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 32(%r2), 3
+; CHECK-NEXT: vl %v2, 48(%r2), 3
+; CHECK-NEXT: vfmdb %v1, %v1, %v2
+; CHECK-NEXT: vfmdb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r2), 3
+; CHECK-NEXT: vl %v2, 80(%r2), 3
+; CHECK-NEXT: vfmdb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r2), 3
+; CHECK-NEXT: vfmdb %v1, %v1, %v2
+; CHECK-NEXT: vfmdb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r2), 3
+; CHECK-NEXT: vfmdb %v24, %v0, %v1
+; CHECK-NEXT: br %r14
+entry:
+ %0 = load <2 x double>, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1
+ %1 = load <2 x double>, ptr %arrayidx1, align 8
+ %mul = fmul reassoc nsz arcp contract afn <2 x double> %0, %1
+ %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2
+ %2 = load <2 x double>, ptr %arrayidx2, align 8
+ %mul3 = fmul reassoc nsz arcp contract afn <2 x double> %mul, %2
+ %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 3
+ %3 = load <2 x double>, ptr %arrayidx4, align 8
+ %mul5 = fmul reassoc nsz arcp contract afn <2 x double> %mul3, %3
+ %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4
+ %4 = load <2 x double>, ptr %arrayidx6, align 8
+ %mul7 = fmul reassoc nsz arcp contract afn <2 x double> %mul5, %4
+ %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5
+ %5 = load <2 x double>, ptr %arrayidx8, align 8
+ %mul9 = fmul reassoc nsz arcp contract afn <2 x double> %mul7, %5
+ %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6
+ %6 = load <2 x double>, ptr %arrayidx10, align 8
+ %mul11 = fmul reassoc nsz arcp contract afn <2 x double> %mul9, %6
+ %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7
+ %7 = load <2 x double>, ptr %arrayidx12, align 8
+ %mul13 = fmul reassoc nsz arcp contract afn <2 x double> %mul11, %7
+ ret <2 x double> %mul13
+}
+
+define <4 x float> @fun14_fmul(ptr %x) {
+; CHECK-LABEL: fun14_fmul:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vl %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vfmsb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 32(%r2), 3
+; CHECK-NEXT: vl %v2, 48(%r2), 3
+; CHECK-NEXT: vfmsb %v1, %v1, %v2
+; CHECK-NEXT: vfmsb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r2), 3
+; CHECK-NEXT: vl %v2, 80(%r2), 3
+; CHECK-NEXT: vfmsb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r2), 3
+; CHECK-NEXT: vfmsb %v1, %v1, %v2
+; CHECK-NEXT: vfmsb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r2), 3
+; CHECK-NEXT: vfmsb %v24, %v0, %v1
+; CHECK-NEXT: br %r14
+entry:
+ %0 = load <4 x float>, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1
+ %1 = load <4 x float>, ptr %arrayidx1, align 8
+ %mul = fmul reassoc nsz arcp contract afn <4 x float> %0, %1
+ %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2
+ %2 = load <4 x float>, ptr %arrayidx2, align 8
+ %mul3 = fmul reassoc nsz arcp contract afn <4 x float> %mul, %2
+ %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3
+ %3 = load <4 x float>, ptr %arrayidx4, align 8
+ %mul5 = fmul reassoc nsz arcp contract afn <4 x float> %mul3, %3
+ %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4
+ %4 = load <4 x float>, ptr %arrayidx6, align 8
+ %mul7 = fmul reassoc nsz arcp contract afn <4 x float> %mul5, %4
+ %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5
+ %5 = load <4 x float>, ptr %arrayidx8, align 8
+ %mul9 = fmul reassoc nsz arcp contract afn <4 x float> %mul7, %5
+ %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6
+ %6 = load <4 x float>, ptr %arrayidx10, align 8
+ %mul11 = fmul reassoc nsz arcp contract afn <4 x float> %mul9, %6
+ %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7
+ %7 = load <4 x float>, ptr %arrayidx12, align 8
+ %mul13 = fmul reassoc nsz arcp contract afn <4 x float> %mul11, %7
+ ret <4 x float> %mul13
+}
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll
new file mode 100644
index 00000000000000..787f6c90c29e45
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll
@@ -0,0 +1,91 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
+; RUN: -print-before=machine-combiner -print-after=machine-combiner -ppc-fma \
+; RUN: 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+define double @fun0_fma2_add(ptr %x, double %A, double %B) {
+; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun0_fma2_add: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d, $f0d, $f2d
+; CHECK-NEXT: [[Y:%2:fp64bit]] = COPY $f2d
+; CHECK-NEXT: [[X:%1:fp64bit]] = COPY $f0d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %6:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFADB_CCPseudo [[X]], [[Y]]
+; CHECK-NEXT: %8:vr64bit = {{.*}} WFMADB killed [[M21:%3:vr64bit]], killed [[M22:%4:vr64bit]], killed %7:vr64bit
+; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB killed [[M31:%5:vr64bit]], killed [[M32:%6:vr64bit]], killed %8:vr64bit
+; CHECK-NEXT: $f0d = COPY %9:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun0_fma2_add: IsSSA, TracksLiveness
+; CHECK: %10:vr64bit = {{.*}} WFMADB killed [[M21]], killed [[M22]], [[X]]
+; CHECK-NEXT: %11:vr64bit = {{.*}} WFMADB killed [[M31]], killed [[M32]], [[Y]]
+; CHECK-NEXT: %9:vr64bit = {{.*}} WFADB_CCPseudo %10:vr64bit, %11:vr64bit
+; CHECK-NEXT: $f0d = COPY %9:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+entry:
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+
+ %0 = load double, ptr %x
+ %1 = load double, ptr %arrayidx1
+ %2 = load double, ptr %arrayidx2
+ %3 = load double, ptr %arrayidx4
+
+ %mul1 = fmul reassoc nsz contract double %0, %1
+ %mul2 = fmul reassoc nsz contract double %2, %3
+
+ %A1 = fadd reassoc nsz contract double %A, %B
+ %A2 = fadd reassoc nsz contract double %A1, %mul1
+ %A3 = fadd reassoc nsz contract double %A2, %mul2
+
+ ret double %A3
+}
+
+; Same as above, but with a long-latency factor (a division) in the root FMA,
+; which makes the reassociation undesirable.
+define double @fun1_fma2_add_divop(ptr %x, double %A, double %B) {
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun1_fma2_add_divop: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d, $f0d, $f2d
+; CHECK-NEXT: %2:fp64bit = COPY $f2d
+; CHECK-NEXT: %1:fp64bit = COPY $f0d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %6:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: %7:vr64bit = nofpexcept WFDDB %5:vr64bit, killed %6:vr64bit, implicit $fpc
+; CHECK-NEXT: %8:vr64bit = {{.*}} WFADB_CCPseudo %1:fp64bit, %2:fp64bit
+; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB killed %3:vr64bit, killed %4:vr64bit, killed %8:vr64bit
+; CHECK-NEXT: %10:vr64bit = {{.*}} WFMADB %5:vr64bit, killed %7:vr64bit, killed %9:vr64bit
+; CHECK-NEXT: $f0d = COPY %10:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+entry:
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+
+ %0 = load double, ptr %x
+ %1 = load double, ptr %arrayidx1
+ %2 = load double, ptr %arrayidx2
+ %3 = load double, ptr %arrayidx4
+ %div = fdiv double %2, %3
+
+ %mul1 = fmul reassoc nsz contract double %0, %1
+ %mul2 = fmul reassoc nsz contract double %2, %div
+
+ %A1 = fadd reassoc nsz contract double %A, %B
+ %A2 = fadd reassoc nsz contract double %A1, %mul1
+ %A3 = fadd reassoc nsz contract double %A2, %mul2
+
+ ret double %A3
+}
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll
new file mode 100644
index 00000000000000..10a671e0c68060
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll
@@ -0,0 +1,123 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
+; RUN: -print-before=machine-combiner -print-after=machine-combiner -z-fma \
+; RUN: 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+; The incoming accumulator is stalling, so it is worth putting the
+; multiplications in parallel with it.
+define double @fun0_fma2_divop(ptr %x) {
+; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun0_fma2_divop: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: [[M21:%1:vr64bit]] = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: [[M22:%2:vr64bit]] = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: [[M11:%3:vr64bit]] = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: [[M12:%4:vr64bit]] = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: [[DIV:%5:vr64bit]] = nofpexcept WFDDB %3:vr64bit, %4:vr64bit, implicit $fpc
+; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB killed [[M21]], killed [[M22]], killed [[DIV]]
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB [[M11]], [[M12]], killed %6:vr64bit
+; CHECK-NEXT: $f0d = COPY %7:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun0_fma2_divop: IsSSA, TracksLiveness
+; CHECK: %8:vr64bit = {{.*}} WFMDB killed [[M21]], killed [[M22]]
+; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB [[M11]], [[M12]], %8:vr64bit
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFADB_CCPseudo killed [[DIV]], %9:vr64bit
+entry:
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+
+ %0 = load double, ptr %x
+ %1 = load double, ptr %arrayidx1
+ %2 = load double, ptr %arrayidx2
+ %3 = load double, ptr %arrayidx4
+ %div = fdiv double %2, %3
+
+ %mul1 = fmul reassoc nsz contract double %0, %1
+ %mul2 = fmul reassoc nsz contract double %2, %3
+
+ %A1 = fadd reassoc nsz contract double %div, %mul1
+ %A2 = fadd reassoc nsz contract double %A1, %mul2
+
+ ret double %A2
+}
+
+; The non-profitable case: the incoming accumulator is immediately available,
+; so the FMA chain is left unchanged.
+define double @fun1_fma2(ptr %x, double %Arg) {
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun1_fma2: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d, $f0d
+; CHECK-NEXT: %1:fp64bit = COPY $f0d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB killed %4:vr64bit, killed %5:vr64bit, killed %6:vr64bit
+; CHECK-NEXT: $f0d = COPY %7:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+entry:
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+
+ %0 = load double, ptr %x
+ %1 = load double, ptr %arrayidx1
+ %2 = load double, ptr %arrayidx2
+ %3 = load double, ptr %arrayidx4
+
+ %mul1 = fmul reassoc nsz contract double %0, %1
+ %mul2 = fmul reassoc nsz contract double %2, %3
+
+ %A1 = fadd reassoc nsz contract double %Arg, %mul1
+ %A2 = fadd reassoc nsz contract double %A1, %mul2
+
+ ret double %A2
+}
+
+; Keep the two FMAs, but change their order due to the long-latency divide.
+define double @fun2_fma2(ptr %x) {
+; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun2_fma2: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: %1:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: [[DIV:%5:vr64bit]] = nofpexcept WFDDB %3:vr64bit, %4:vr64bit, implicit $fpc
+; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB killed %1:vr64bit, killed [[DIV]], killed %2:vr64bit
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB %3:vr64bit, %4:vr64bit, killed %6:vr64bit
+
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun2_fma2: IsSSA, TracksLiveness
+; CHECK: %12:vr64bit = {{.*}} WFMADB %3:vr64bit, %4:vr64bit, killed %2:vr64bit
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB killed %1:vr64bit, killed [[DIV]], %12:vr64bit
+
+entry:
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+
+ %0 = load double, ptr %x
+ %1 = load double, ptr %arrayidx1
+ %2 = load double, ptr %arrayidx2
+ %3 = load double, ptr %arrayidx4
+ %div = fdiv double %2, %3
+
+ %mul1 = fmul reassoc nsz contract double %0, %div
+ %mul2 = fmul reassoc nsz contract double %2, %3
+
+ %A1 = fadd reassoc nsz contract double %1, %mul1
+ %A2 = fadd reassoc nsz contract double %A1, %mul2
+
+ ret double %A2
+}
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll
new file mode 100644
index 00000000000000..5db80a465da5fe
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll
@@ -0,0 +1,116 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
+; RUN: -print-before=machine-combiner -print-after=machine-combiner -z-fma \
+; RUN: 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
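+; These tests cover an FMA whose addend is itself an add: when one operand of
+; that add is stalling, the multiply-add is moved onto the other operand and
+; the stalling value is added last.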
+; No improvement possible.
+define double @fun0_fma1add(ptr %x) {
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun0_fma1add: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: %1:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: %5:vr64bit = {{.*}} WFADB_CCPseudo killed %3:vr64bit, killed %4:vr64bit
+; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB killed %1:vr64bit, killed %2:vr64bit, killed %5:vr64bit
+; CHECK-NEXT: $f0d = COPY %6:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+entry:
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+
+ %0 = load double, ptr %x
+ %1 = load double, ptr %arrayidx1
+ %2 = load double, ptr %arrayidx2
+ %3 = load double, ptr %arrayidx4
+
+ %mul = fmul reassoc nsz contract double %0, %1
+
+ %A1 = fadd reassoc nsz contract double %2, %3
+ %A2 = fadd reassoc nsz contract double %A1, %mul
+
+ ret double %A2
+}
+
+; The RHS of the Add is stalling, so move the FMA up onto the LHS operand.
+define double @fun1_fma1add_divop(ptr %x) {
+; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun1_fma1add_divop: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: [[M21:%1:vr64bit]] = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: [[M22:%2:vr64bit]] = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: [[T1:%3:vr64bit]] = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: [[DIV:%5:vr64bit]] = nofpexcept WFDDB [[T1]], killed %4:vr64bit, implicit $fpc
+; CHECK-NEXT: %6:vr64bit = {{.*}} WFADB_CCPseudo [[T1]], killed [[DIV]]
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB killed [[M21]], killed [[M22]], killed %6:vr64bit
+; CHECK-NEXT: $f0d = COPY %7:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun1_fma1add_divop: IsSSA, TracksLiveness
+; CHECK: %8:vr64bit = {{.*}} WFMADB killed [[M21]], killed [[M22]], [[T1]]
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFADB_CCPseudo %8:vr64bit, killed [[DIV]]
+entry:
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+
+ %0 = load double, ptr %x
+ %1 = load double, ptr %arrayidx1
+ %2 = load double, ptr %arrayidx2
+ %3 = load double, ptr %arrayidx4
+ %div = fdiv double %2, %3
+
+ %mul = fmul reassoc nsz contract double %0, %1
+
+ %A1 = fadd reassoc nsz contract double %2, %div
+ %A2 = fadd reassoc nsz contract double %A1, %mul
+
+ ret double %A2
+}
+
+; The LHS of the Add is stalling, so move the FMA up onto the RHS operand.
+define double @fun2_fma1add_divop(ptr %x) {
+; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun2_fma1add_divop: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: [[M21:%1:vr64bit]] = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: [[M22:%2:vr64bit]] = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: [[T2:%4:vr64bit]] = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: [[DIV:%5:vr64bit]] = nofpexcept WFDDB killed %3:vr64bit, %4:vr64bit, implicit $fpc
+; CHECK-NEXT: %6:vr64bit = {{.*}} WFADB_CCPseudo killed [[DIV]], [[T2]]
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB killed [[M21]], killed [[M22]], killed %6:vr64bit
+
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun2_fma1add_divop: IsSSA, TracksLiveness
+; CHECK: %9:vr64bit = {{.*}} WFMADB killed [[M21]], killed [[M22]], [[T2]]
+; CHECK: %7:vr64bit = {{.*}} WFADB_CCPseudo %9:vr64bit, killed [[DIV]]
+entry:
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+
+ %0 = load double, ptr %x
+ %1 = load double, ptr %arrayidx1
+ %2 = load double, ptr %arrayidx2
+ %3 = load double, ptr %arrayidx4
+ %div = fdiv double %2, %3
+
+ %mul = fmul reassoc nsz contract double %0, %1
+
+ %A1 = fadd reassoc nsz contract double %div, %3
+ %A2 = fadd reassoc nsz contract double %A1, %mul
+
+ ret double %A2
+}
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll
new file mode 100644
index 00000000000000..885047b67c621f
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll
@@ -0,0 +1,177 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -O3 -print-before=machine-combiner \
+; RUN: -print-after=machine-combiner -debug-only=machine-combiner,systemz-II -z-fma 2>&1 \
+; RUN: | FileCheck %s
+
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -O3 \
+; RUN: -print-after=machine-combiner -debug-only=machine-combiner,systemz-II -ppc-fma 2>&1 \
+; RUN: | FileCheck %s --check-prefix=ALT
+
+; REQUIRES: asserts
+
+; Test transformation of a sequence of 8 FMAs, with different patterns.
+
+define double @fun_fma8(ptr %x, double %A) {
+; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun_fma8: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d, $f0d
+; CHECK-NEXT: %1:fp64bit = COPY $f0d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: %6:vr64bit = VL64 %0:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx6)
+; CHECK-NEXT: %7:vr64bit = VL64 %0:addr64bit, 40, $noreg :: (load (s64) from %ir.arrayidx8)
+; CHECK-NEXT: %8:vr64bit = VL64 %0:addr64bit, 48, $noreg :: (load (s64) from %ir.arrayidx10)
+; CHECK-NEXT: %9:vr64bit = VL64 %0:addr64bit, 56, $noreg :: (load (s64) from %ir.arrayidx12)
+; CHECK-NEXT: %10:vr64bit = VL64 %0:addr64bit, 64, $noreg :: (load (s64) from %ir.arrayidx14)
+; CHECK-NEXT: %11:vr64bit = VL64 %0:addr64bit, 72, $noreg :: (load (s64) from %ir.arrayidx16)
+; CHECK-NEXT: %12:vr64bit = VL64 %0:addr64bit, 80, $noreg :: (load (s64) from %ir.arrayidx18)
+; CHECK-NEXT: %13:vr64bit = VL64 %0:addr64bit, 88, $noreg :: (load (s64) from %ir.arrayidx20)
+; CHECK-NEXT: %14:vr64bit = VL64 %0:addr64bit, 96, $noreg :: (load (s64) from %ir.arrayidx22)
+; CHECK-NEXT: %15:vr64bit = VL64 %0:addr64bit, 104, $noreg :: (load (s64) from %ir.arrayidx24)
+; CHECK-NEXT: %16:vr64bit = VL64 %0:addr64bit, 112, $noreg :: (load (s64) from %ir.arrayidx26)
+; CHECK-NEXT: %17:vr64bit = VL64 %0:addr64bit, 120, $noreg :: (load (s64) from %ir.arrayidx28)
+; CHECK-NEXT: %18:vr64bit = {{.*}} WFMADB killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
+; CHECK-NEXT: %19:vr64bit = {{.*}} WFMADB killed %4:vr64bit, killed %5:vr64bit, killed %18:vr64bit
+; CHECK-NEXT: %20:vr64bit = {{.*}} WFMADB killed %6:vr64bit, killed %7:vr64bit, killed %19:vr64bit
+; CHECK-NEXT: %21:vr64bit = {{.*}} WFMADB killed %8:vr64bit, killed %9:vr64bit, killed %20:vr64bit
+; CHECK-NEXT: %22:vr64bit = {{.*}} WFMADB killed %10:vr64bit, killed %11:vr64bit, killed %21:vr64bit
+; CHECK-NEXT: %23:vr64bit = {{.*}} WFMADB killed %12:vr64bit, killed %13:vr64bit, killed %22:vr64bit
+; CHECK-NEXT: %24:vr64bit = {{.*}} WFMADB killed %14:vr64bit, killed %15:vr64bit, killed %23:vr64bit
+; CHECK-NEXT: %25:vr64bit = {{.*}} WFMADB killed %16:vr64bit, killed %17:vr64bit, killed %24:vr64bit
+; CHECK-NEXT: $f0d = COPY %25:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+
+; CHECK: Machine InstCombiner: fun_fma8
+; CHECK: add pattern FMA2_P1P0
+; CHECK-NEXT: add pattern FMA2_P0P1
+; CHECK-NEXT: add pattern FMA2
+; CHECK: reassociating using pattern FMA_P1P0
+; CHECK: Dependence data for %21:vr64bit = {{.*}} WFMADB
+; CHECK-NEXT: NewRootDepth: 16 RootDepth: 22 It MustReduceDepth and it does it
+; CHECK-NEXT: Resource length before replacement: 16 and after: 16
+; CHECK-NEXT: As result it IMPROVES/PRESERVES Resource Length
+; CHECK: add pattern FMA2_P1P0
+; CHECK-NEXT: add pattern FMA2_P0P1
+; CHECK-NEXT: add pattern FMA2
+; CHECK-NEXT: reassociating using pattern FMA_P1P0
+; CHECK-NEXT: Dependence data for %23:vr64bit = {{.*}} WFMADB
+; CHECK-NEXT: NewRootDepth: 22 RootDepth: 28 It MustReduceDepth and it does it
+; CHECK: Resource length before replacement: 16 and after: 16
+; CHECK-NEXT: As result it IMPROVES/PRESERVES Resource Length
+; CHECK-NEXT: add pattern FMA1_Add_L
+; CHECK-NEXT: add pattern FMA1_Add_R
+; CHECK-NEXT: reassociating using pattern FMA1_Add_L
+; CHECK-NEXT: Dependence data for %24:vr64bit = {{.*}} WFMADB
+; CHECK-NEXT: NewRootDepth: 28 RootDepth: 28 It MustReduceDepth but it does NOT do it
+; CHECK-NEXT: reassociating using pattern FMA1_Add_R
+; CHECK-NEXT: Dependence data for %24:vr64bit = {{.*}} WFMADB
+; CHECK-NEXT: NewRootDepth: 22 RootDepth: 28 It MustReduceDepth and it does it
+; CHECK-NEXT: Resource length before replacement: 16 and after: 16
+; CHECK-NEXT: As result it IMPROVES/PRESERVES Resource Length
+
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK: %18:vr64bit = {{.*}} WFMADB killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
+; CHECK-NEXT: %19:vr64bit = {{.*}} WFMADB killed %4:vr64bit, killed %5:vr64bit, killed %18:vr64bit
+; CHECK-NEXT: %36:vr64bit = {{.*}} WFMDB killed %6:vr64bit, killed %7:vr64bit
+; CHECK-NEXT: %37:vr64bit = {{.*}} WFMADB killed %8:vr64bit, killed %9:vr64bit, %36:vr64bit
+; CHECK-NEXT: %21:vr64bit = {{.*}} WFADB_CCPseudo killed %19:vr64bit, %37:vr64bit
+; CHECK-NEXT: %40:vr64bit = {{.*}} WFMDB killed %10:vr64bit, killed %11:vr64bit
+; CHECK-NEXT: %41:vr64bit = {{.*}} WFMADB killed %12:vr64bit, killed %13:vr64bit, %40:vr64bit
+; CHECK-NEXT: %43:vr64bit = {{.*}} WFMADB killed %14:vr64bit, killed %15:vr64bit, %41:vr64bit
+; CHECK-NEXT: %24:vr64bit = {{.*}} WFADB_CCPseudo %43:vr64bit, killed %21:vr64bit
+; CHECK-NEXT: %25:vr64bit = {{.*}} WFMADB killed %16:vr64bit, killed %17:vr64bit, killed %24:vr64bit
+
+; ALT: Machine InstCombiner: fun_fma8
+; ALT-NEXT: Combining MBB entry
+; ALT-NEXT: add pattern FMA3
+; ALT-NEXT: reassociating using pattern FMA3
+; ALT-NEXT: Dependence data for %20:vr64bit = {{.*}} WFMADB
+; ALT-NEXT: NewRootDepth: 16 RootDepth: 16 It MustReduceDepth but it does NOT do it
+; ALT-NEXT: add pattern FMA3
+; ALT-NEXT: reassociating using pattern FMA3
+; ALT-NEXT: Dependence data for %21:vr64bit = {{.*}} WFMADB
+; ALT-NEXT: NewRootDepth: 16 RootDepth: 22 It MustReduceDepth and it does it
+; ALT-NEXT: Resource length before replacement: 16 and after: 16
+; ALT-NEXT: As result it IMPROVES/PRESERVES Resource Length
+; ALT-NEXT: add pattern FMA2_Add
+; ALT-NEXT: reassociating using pattern FMA2_Add
+; ALT-NEXT: Dependence data for %23:vr64bit = {{.*}} WFMADB
+; ALT-NEXT: NewRootDepth: 22 RootDepth: 28 It MustReduceDepth and it does it
+; ALT-NEXT: Resource length before replacement: 16 and after: 16
+; ALT-NEXT: As result it IMPROVES/PRESERVES Resource Length
+; ALT-NEXT: add pattern FMA2_Add
+; ALT-NEXT: reassociating using pattern FMA2_Add
+; ALT-NEXT: Dependence data for %25:vr64bit = {{.*}} WFMADB
+; ALT-NEXT: NewRootDepth: 28 RootDepth: 34 It MustReduceDepth and it does it
+; ALT-NEXT: Resource length before replacement: 16 and after: 16
+; ALT-NEXT: As result it IMPROVES/PRESERVES Resource Length
+
+; ALT: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; ALT: %18:vr64bit = {{.*}} WFMADB killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
+; ALT-NEXT: %29:vr64bit = {{.*}} WFMDB killed %4:vr64bit, killed %5:vr64bit
+; ALT-NEXT: %30:vr64bit = {{.*}} WFMADB killed %6:vr64bit, killed %7:vr64bit, killed %18:vr64bit
+; ALT-NEXT: %31:vr64bit = {{.*}} WFMADB killed %8:vr64bit, killed %9:vr64bit, %29:vr64bit
+; ALT-NEXT: %32:vr64bit = {{.*}} WFMADB killed %10:vr64bit, killed %11:vr64bit, %30:vr64bit
+; ALT-NEXT: %33:vr64bit = {{.*}} WFMADB killed %12:vr64bit, killed %13:vr64bit, %31:vr64bit
+; ALT-NEXT: %34:vr64bit = {{.*}} WFMADB killed %14:vr64bit, killed %15:vr64bit, %32:vr64bit
+; ALT-NEXT: %35:vr64bit = {{.*}} WFMADB killed %16:vr64bit, killed %17:vr64bit, %33:vr64bit
+; ALT-NEXT: %25:vr64bit = {{.*}} WFADB_CCPseudo %34:vr64bit, %35:vr64bit
+
+entry:
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+ %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+ %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+ %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+ %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+ %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+ %arrayidx14 = getelementptr inbounds double, ptr %x, i64 8
+ %arrayidx16 = getelementptr inbounds double, ptr %x, i64 9
+ %arrayidx18 = getelementptr inbounds double, ptr %x, i64 10
+ %arrayidx20 = getelementptr inbounds double, ptr %x, i64 11
+ %arrayidx22 = getelementptr inbounds double, ptr %x, i64 12
+ %arrayidx24 = getelementptr inbounds double, ptr %x, i64 13
+ %arrayidx26 = getelementptr inbounds double, ptr %x, i64 14
+ %arrayidx28 = getelementptr inbounds double, ptr %x, i64 15
+
+ %0 = load double, ptr %x
+ %1 = load double, ptr %arrayidx1
+ %2 = load double, ptr %arrayidx2
+ %3 = load double, ptr %arrayidx4
+ %4 = load double, ptr %arrayidx6
+ %5 = load double, ptr %arrayidx8
+ %6 = load double, ptr %arrayidx10
+ %7 = load double, ptr %arrayidx12
+ %8 = load double, ptr %arrayidx14
+ %9 = load double, ptr %arrayidx16
+ %10 = load double, ptr %arrayidx18
+ %11 = load double, ptr %arrayidx20
+ %12 = load double, ptr %arrayidx22
+ %13 = load double, ptr %arrayidx24
+ %14 = load double, ptr %arrayidx26
+ %15 = load double, ptr %arrayidx28
+
+ %mul1 = fmul reassoc nsz contract double %0, %1
+ %mul2 = fmul reassoc nsz contract double %2, %3
+ %mul3 = fmul reassoc nsz contract double %4, %5
+ %mul4 = fmul reassoc nsz contract double %6, %7
+ %mul5 = fmul reassoc nsz contract double %8, %9
+ %mul6 = fmul reassoc nsz contract double %10, %11
+ %mul7 = fmul reassoc nsz contract double %12, %13
+ %mul8 = fmul reassoc nsz contract double %14, %15
+
+ %A1 = fadd reassoc nsz contract double %A, %mul1
+ %A2 = fadd reassoc nsz contract double %A1, %mul2
+ %A3 = fadd reassoc nsz contract double %A2, %mul3
+ %A4 = fadd reassoc nsz contract double %A3, %mul4
+ %A5 = fadd reassoc nsz contract double %A4, %mul5
+ %A6 = fadd reassoc nsz contract double %A5, %mul6
+ %A7 = fadd reassoc nsz contract double %A6, %mul7
+ %A8 = fadd reassoc nsz contract double %A7, %mul8
+
+ ret double %A8
+}
+
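For reference, here is a minimal standalone C++ sketch (not part of the patch) of
the reassociation that the FMA tests above check for: one serial FMA chain is
split into two shorter, independent chains joined by a final add. All values and
names below are made up for illustration, and the rewrite is only legal under the
reassoc-style fast-math flags used in the tests.

#include <cmath>
#include <cstdio>

int main() {
  double A = 1.5;                                  // incoming accumulator
  double x1 = 1.1, y1 = 2.2, x2 = 3.3, y2 = 4.4;
  double x3 = 5.5, y3 = 6.6, x4 = 7.7, y4 = 8.8;

  // Before: each FMA depends on the previous one (a chain of 4 FMAs).
  double serial =
      std::fma(x4, y4,
        std::fma(x3, y3,
          std::fma(x2, y2,
            std::fma(x1, y1, A))));

  // After (FMA2-style split): two independent chains joined by one add.
  double lhs = std::fma(x2, y2, std::fma(x1, y1, A));
  double rhs = std::fma(x4, y4, x3 * y3);
  double parallel = lhs + rhs;

  // The two results may differ in the last bits; that is why the IR in the
  // tests carries the reassoc/nsz/contract flags.
  std::printf("serial   = %.17g\nparallel = %.17g\n", serial, parallel);
  return 0;
}
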
>From f08512bb0d4618ffb600c6c3feaa6e039aee2eab Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Mon, 4 Mar 2024 09:54:16 -0500
Subject: [PATCH 2/9] RegMem patch.
---
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 2 +-
llvm/lib/CodeGen/PeepholeOptimizer.cpp | 1 +
llvm/lib/Target/SystemZ/CMakeLists.txt | 1 +
llvm/lib/Target/SystemZ/SystemZ.h | 2 +
.../Target/SystemZ/SystemZFinalizeRegMem.cpp | 94 +++++++
.../Target/SystemZ/SystemZISelLowering.cpp | 4 +
llvm/lib/Target/SystemZ/SystemZInstrFP.td | 23 +-
.../lib/Target/SystemZ/SystemZInstrFormats.td | 14 +
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 155 ++++++++++-
llvm/lib/Target/SystemZ/SystemZInstrInfo.h | 6 +
llvm/lib/Target/SystemZ/SystemZInstrVector.td | 34 +--
llvm/lib/Target/SystemZ/SystemZOperators.td | 13 +
llvm/lib/Target/SystemZ/SystemZScheduleZ13.td | 2 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ14.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ15.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ16.td | 4 +-
.../Target/SystemZ/SystemZTargetMachine.cpp | 1 +
.../CodeGen/SystemZ/fp-regmem-folding-01.ll | 251 ++++++++++++++++++
.../CodeGen/SystemZ/fp-regmem-folding-02.ll | 164 ++++++++++++
.../CodeGen/SystemZ/fp-regmem-folding-03.ll | 86 ++++++
.../CodeGen/SystemZ/fp-regmem-folding-04.ll | 62 +++++
21 files changed, 886 insertions(+), 41 deletions(-)
create mode 100644 llvm/lib/Target/SystemZ/SystemZFinalizeRegMem.cpp
create mode 100644 llvm/test/CodeGen/SystemZ/fp-regmem-folding-01.ll
create mode 100644 llvm/test/CodeGen/SystemZ/fp-regmem-folding-02.ll
create mode 100644 llvm/test/CodeGen/SystemZ/fp-regmem-folding-03.ll
create mode 100644 llvm/test/CodeGen/SystemZ/fp-regmem-folding-04.ll
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index d5b1df2114e9e7..d09dc3a9577d64 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1710,7 +1710,7 @@ class TargetInstrInfo : public MCInstrInfo {
/// instruction that defines FoldAsLoadDefReg, and the function returns
/// the machine instruction generated due to folding.
virtual MachineInstr *optimizeLoadInstr(MachineInstr &MI,
- const MachineRegisterInfo *MRI,
+ MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
return nullptr;
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 477a86dbe3f8c4..5cd5962d4701e4 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -1868,6 +1868,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
// If we run into an instruction we can't fold across, discard
// the load candidates. Note: We might be able to fold *into* this
// instruction, so this needs to be after the folding logic.
+ // TODO: Try AA for a store?
if (MI->isLoadFoldBarrier()) {
LLVM_DEBUG(dbgs() << "Encountered load fold barrier on " << *MI);
FoldAsLoadDefCandidates.clear();
diff --git a/llvm/lib/Target/SystemZ/CMakeLists.txt b/llvm/lib/Target/SystemZ/CMakeLists.txt
index 063e5bcd44171e..6ab5d50c576ceb 100644
--- a/llvm/lib/Target/SystemZ/CMakeLists.txt
+++ b/llvm/lib/Target/SystemZ/CMakeLists.txt
@@ -20,6 +20,7 @@ add_llvm_target(SystemZCodeGen
SystemZConstantPoolValue.cpp
SystemZCopyPhysRegs.cpp
SystemZElimCompare.cpp
+ SystemZFinalizeRegMem.cpp
SystemZFrameLowering.cpp
SystemZHazardRecognizer.cpp
SystemZISelDAGToDAG.cpp
diff --git a/llvm/lib/Target/SystemZ/SystemZ.h b/llvm/lib/Target/SystemZ/SystemZ.h
index d7aa9e4e18cbbb..c357c5a4250144 100644
--- a/llvm/lib/Target/SystemZ/SystemZ.h
+++ b/llvm/lib/Target/SystemZ/SystemZ.h
@@ -195,12 +195,14 @@ FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZCopyPhysRegsPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZFinalizeRegMemPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZPostRewritePass(SystemZTargetMachine &TM);
FunctionPass *createSystemZTDCPass();
void initializeSystemZCopyPhysRegsPass(PassRegistry &);
void initializeSystemZDAGToDAGISelPass(PassRegistry &);
void initializeSystemZElimComparePass(PassRegistry &);
+void initializeSystemZFinalizeRegMemPass(PassRegistry &);
void initializeSystemZLDCleanupPass(PassRegistry &);
void initializeSystemZLongBranchPass(PassRegistry &);
void initializeSystemZPostRewritePass(PassRegistry &);
diff --git a/llvm/lib/Target/SystemZ/SystemZFinalizeRegMem.cpp b/llvm/lib/Target/SystemZ/SystemZFinalizeRegMem.cpp
new file mode 100644
index 00000000000000..8de409f4f6a3ee
--- /dev/null
+++ b/llvm/lib/Target/SystemZ/SystemZFinalizeRegMem.cpp
@@ -0,0 +1,94 @@
+//===------- SystemZFinalizeRegMem.cpp - Finalize FP reg/mem folding ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts any remaining reg/reg pseudos into the corresponding real
+// target instructions in cases where the peephole optimizer did not fold a
+// load into a reg/mem instruction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+class SystemZFinalizeRegMem : public MachineFunctionPass {
+public:
+ static char ID;
+ SystemZFinalizeRegMem()
+ : MachineFunctionPass(ID), TII(nullptr), MRI(nullptr) {
+ initializeSystemZFinalizeRegMemPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+
+ bool visitMBB(MachineBasicBlock &MBB);
+
+ const SystemZInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+};
+
+char SystemZFinalizeRegMem::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(SystemZFinalizeRegMem, "systemz-finalize-regmem",
+ "SystemZ Finalize RegMem", false, false)
+
+FunctionPass *llvm::
+createSystemZFinalizeRegMemPass(SystemZTargetMachine &TM) {
+ return new SystemZFinalizeRegMem();
+}
+
+void SystemZFinalizeRegMem::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool SystemZFinalizeRegMem::visitMBB(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ for (MachineInstr &MI : MBB) {
+ unsigned PseudoOpcode = MI.getOpcode();
+ unsigned TargetOpcode =
+ PseudoOpcode == SystemZ::WFADB_CCPseudo ? SystemZ::WFADB
+ : PseudoOpcode == SystemZ::WFASB_CCPseudo ? SystemZ::WFASB
+ : PseudoOpcode == SystemZ::WFSDB_CCPseudo ? SystemZ::WFSDB
+ : PseudoOpcode == SystemZ::WFSSB_CCPseudo ? SystemZ::WFSSB
+ : 0;
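+ // A reg/reg CC pseudo that was not folded into a reg/mem instruction by the
+ // peephole optimizer: switch to the plain VR opcode and drop the CC def that
+ // only the pseudo carries.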
+ if (TargetOpcode) {
+ MI.setDesc(TII->get(TargetOpcode));
+ int CCIdx = MI.findRegisterDefOperandIdx(SystemZ::CC);
+ MI.removeOperand(CCIdx);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+bool SystemZFinalizeRegMem::runOnMachineFunction(MachineFunction &F) {
+ TII = F.getSubtarget<SystemZSubtarget>().getInstrInfo();
+ MRI = &F.getRegInfo();
+
+ bool Modified = false;
+ for (auto &MBB : F)
+ Modified |= visitMBB(MBB);
+
+ return Modified;
+}
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 2da4431cf077eb..2e4e368f3d6779 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -692,6 +692,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
}
+ // Don't select reg/mem LDEB if WLDEB is available.
+ if (Subtarget.hasVector())
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+
// Floating-point truncation and stores need to be done separately.
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index f4b5aeaebef923..7f9ae518f7aaf2 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -201,6 +201,7 @@ let Predicates = [FeatureNoVectorEnhancements1] in {
// Extend memory floating-point values to wider representations.
let Uses = [FPC], mayRaiseFPException = 1 in {
def LDEB : UnaryRXE<"ldeb", 0xED04, z_any_extloadf32, FP64, 4>;
+ def LDEB : UnaryRXE<"ldeb", 0xED04, z_fpr_any_extloadf32, FP64, 4>;
def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>;
def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>;
}
@@ -362,8 +363,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def SQDBR : UnaryRRE<"sqdbr", 0xB315, any_fsqrt, FP64, FP64>;
def SQXBR : UnaryRRE<"sqxbr", 0xB316, any_fsqrt, FP128, FP128>;
- def SQEB : UnaryRXE<"sqeb", 0xED14, loadu<any_fsqrt>, FP32, 4>;
- def SQDB : UnaryRXE<"sqdb", 0xED15, loadu<any_fsqrt>, FP64, 8>;
+ def SQEB : UnaryRXE<"sqeb", 0xED14, loadu<any_fsqrt, z_fprload>, FP32, 4>;
+ def SQDB : UnaryRXE<"sqdb", 0xED15, loadu<any_fsqrt, z_fprload>, FP64, 8>;
}
// Round to an integer, with the second operand (modifier M3) specifying
@@ -432,6 +433,8 @@ let Uses = [FPC], mayRaiseFPException = 1,
}
defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, z_load, 4>;
defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, z_load, 8>;
+ defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, z_fprload, 4>;
+ defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, z_fprload, 8>;
}
// Subtraction.
@@ -443,6 +446,8 @@ let Uses = [FPC], mayRaiseFPException = 1,
defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, any_fsub, FP32, z_load, 4>;
defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, any_fsub, FP64, z_load, 8>;
+ defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, any_fsub, FP32, z_fprload, 4>;
+ defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, any_fsub, FP64, z_fprload, 8>;
}
// Multiplication.
@@ -454,6 +459,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
}
defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, z_load, 4>;
defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, any_fmul, FP64, z_load, 8>;
+ defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, z_fprload, 4>;
+ defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, any_fmul, FP64, z_fprload, 8>;
}
// f64 multiplication of two FP32 registers.
@@ -497,6 +504,10 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, z_load, 4>;
defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, z_load, 8>;
+ defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32,
+ z_fprload, 4>;
+ defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64,
+ z_fprload, 8>;
}
// Fused multiply-subtract.
@@ -506,6 +517,10 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, z_load, 4>;
defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, z_load, 8>;
+ defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32,
+ z_fprload, 4>;
+ defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64,
+ z_fprload, 8>;
}
// Division.
@@ -516,6 +531,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
defm DEB : BinaryRXEAndPseudo<"deb", 0xED0D, any_fdiv, FP32, z_load, 4>;
defm DDB : BinaryRXEAndPseudo<"ddb", 0xED1D, any_fdiv, FP64, z_load, 8>;
+ defm DEB : BinaryRXEAndPseudo<"deb", 0xED0D, any_fdiv, FP32, z_fprload, 4>;
+ defm DDB : BinaryRXEAndPseudo<"ddb", 0xED1D, any_fdiv, FP64, z_fprload, 8>;
}
// Divide to integer.
@@ -535,6 +552,8 @@ let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC], CCValues = 0xF in {
def CEB : CompareRXE<"ceb", 0xED09, z_any_fcmp, FP32, z_load, 4>;
def CDB : CompareRXE<"cdb", 0xED19, z_any_fcmp, FP64, z_load, 8>;
+ def CEB : CompareRXE<"ceb", 0xED09, z_any_fcmp, FP32, z_fprload, 4>;
+ def CDB : CompareRXE<"cdb", 0xED19, z_any_fcmp, FP64, z_fprload, 8>;
def KEBR : CompareRRE<"kebr", 0xB308, z_strict_fcmps, FP32, FP32>;
def KDBR : CompareRRE<"kdbr", 0xB318, z_strict_fcmps, FP64, FP64>;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index 3dba33b66bf4f4..4c804832eb1a20 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5536,3 +5536,17 @@ multiclass StringRRE<string mnemonic, bits<16> opcode,
[(set GR64:$end, (operator GR64:$start1, GR64:$start2,
GR32:$char))]>;
}
+
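+// Defines both the real reg/reg instruction and a CC-clobbering pseudo. The
+// selection pattern is on the pseudo, so the peephole optimizer can later
+// either fold a load into the CC-setting reg/mem form or have the CC def
+// stripped again by SystemZFinalizeRegMem.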
+multiclass BinaryVRRcAndCCPseudo<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator,
+ TypedReg tr1, TypedReg tr2, bits<4> type = 0,
+ bits<4> m5 = 0, bits<4> m6 = 0,
+ string fp_mnemonic = ""> {
+ def "" : BinaryVRRc<mnemonic, opcode, null_frag, tr1, tr2, type, m5, m6,
+ fp_mnemonic>;
+ let Defs = [CC] in
+ def _CCPseudo : Pseudo<(outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3),
+ [(set (tr1.vt tr1.op:$V1),
+ (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3)))]>;
+}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 7f850a43a31f0f..a31730e09710e6 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -611,6 +611,147 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
.addImm(CCValid).addImm(CCMask);
}
+static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
+ if (OldMI->registerDefIsDead(SystemZ::CC)) {
+ MachineOperand *CCDef = NewMI->findRegisterDefOperand(SystemZ::CC);
+ if (CCDef != nullptr)
+ CCDef->setIsDead(true);
+ }
+}
+
+void SystemZInstrInfo::transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
+ MachineInstr::MIFlag Flag) const {
+ if (OldMI->getFlag(Flag))
+ NewMI->setFlag(Flag);
+}
+
+static cl::opt<bool> DISABLE_FOLDING("disable-folding", cl::init(false));
+static cl::opt<bool> FOLD_LDY("fold-ldy", cl::init(false));
+
+MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
+ MachineRegisterInfo *MRI,
+ Register &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const {
+ // TODO: Would it be beneficial to not fold in cases of high register pressure?
+ if (DISABLE_FOLDING)
+ return nullptr;
+
+ const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+
+ // Check whether we can move the DefMI load, and that it only has one use.
+ DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+ assert(DefMI);
+ bool SawStore = false;
+ if (!DefMI->isSafeToMove(nullptr, SawStore) ||
+ !MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
+ return nullptr;
+
+ unsigned LoadOpcD12 = 0;
+ unsigned LoadOpcD20 = 0;
+ unsigned RegMemOpcode = 0;
+ const TargetRegisterClass *FPRC = nullptr;
+ RegMemOpcode = MI.getOpcode() == SystemZ::WFADB_CCPseudo ? SystemZ::ADB
+ : MI.getOpcode() == SystemZ::WFSDB_CCPseudo ? SystemZ::SDB
+ : MI.getOpcode() == SystemZ::WFMDB ? SystemZ::MDB
+ : MI.getOpcode() == SystemZ::WFDDB ? SystemZ::DDB
+ : MI.getOpcode() == SystemZ::WFMADB ? SystemZ::MADB
+ : MI.getOpcode() == SystemZ::WFMSDB ? SystemZ::MSDB
+ : MI.getOpcode() == SystemZ::WFSQDB ? SystemZ::SQDB
+ : MI.getOpcode() == SystemZ::WFCDB ? SystemZ::CDB
+ : 0;
+ if (RegMemOpcode) {
+ LoadOpcD12 = SystemZ::VL64;
+ LoadOpcD20 = SystemZ::LDY;
+ FPRC = &SystemZ::FP64BitRegClass;
+ } else {
+ RegMemOpcode = MI.getOpcode() == SystemZ::WFASB_CCPseudo ? SystemZ::AEB
+ : MI.getOpcode() == SystemZ::WFSSB_CCPseudo ? SystemZ::SEB
+ : MI.getOpcode() == SystemZ::WFMSB ? SystemZ::MEEB
+ : MI.getOpcode() == SystemZ::WFDSB ? SystemZ::DEB
+ : MI.getOpcode() == SystemZ::WFMASB ? SystemZ::MAEB
+ : MI.getOpcode() == SystemZ::WFMSSB ? SystemZ::MSEB
+ : MI.getOpcode() == SystemZ::WFSQSB ? SystemZ::SQEB
+ : MI.getOpcode() == SystemZ::WFCSB ? SystemZ::CEB
+ : 0;
+ if (RegMemOpcode) {
+ LoadOpcD12 = SystemZ::VL32;
+ LoadOpcD20 = SystemZ::LEY;
+ FPRC = &SystemZ::FP32BitRegClass;
+ }
+ }
+ if (MI.getOpcode() == SystemZ::WLDEB) {
+ RegMemOpcode = SystemZ::LDEB;
+ LoadOpcD12 = SystemZ::VL32;
+ LoadOpcD20 = SystemZ::LEY;
+ FPRC = &SystemZ::FP64BitRegClass;
+ }
+
+ if (!RegMemOpcode ||
+ (DefMI->getOpcode() != LoadOpcD12 && DefMI->getOpcode() != LoadOpcD20))
+ return nullptr;
+
+ if (DefMI->getOpcode() == LoadOpcD20 && !FOLD_LDY)
+ return nullptr;
+
+ DebugLoc DL = MI.getDebugLoc();
+ Register DstReg = MI.getOperand(0).getReg();
+
+ bool IsUnary = (RegMemOpcode == SystemZ::LDEB || RegMemOpcode == SystemZ::SQEB ||
+ RegMemOpcode == SystemZ::SQDB);
+ bool IsTernary =
+ (RegMemOpcode == SystemZ::MADB || RegMemOpcode == SystemZ::MAEB ||
+ RegMemOpcode == SystemZ::MSDB || RegMemOpcode == SystemZ::MSEB);
+ bool IsCmp = (RegMemOpcode == SystemZ::CEB || RegMemOpcode == SystemZ::CDB);
+ // (TODO: also handle strict FP compares?)
+
+ MachineOperand LHS = MI.getOperand(1 - IsCmp);
+ MachineOperand RHS = MI.getOperand(2 - IsCmp);
+ MachineOperand &RegMO = RHS.getReg() == FoldAsLoadDefReg ? LHS : RHS;
+ MachineOperand *AccMO = IsTernary ? &MI.getOperand(3) : nullptr;
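+ // Only the RHS of the non-commutative opcodes can become the memory operand,
+ // and the FMA accumulator operand is never folded.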
+ if ((RegMemOpcode == SystemZ::SDB || RegMemOpcode == SystemZ::SEB ||
+ RegMemOpcode == SystemZ::DDB || RegMemOpcode == SystemZ::DEB ||
+ RegMemOpcode == SystemZ::CDB || RegMemOpcode == SystemZ::CEB) &&
+ FoldAsLoadDefReg != RHS.getReg())
+ return nullptr;
+ if (IsTernary && FoldAsLoadDefReg == AccMO->getReg())
+ return nullptr;
+
+ MachineInstrBuilder MIB =
+ BuildMI(*MI.getParent(), MI, DL, get(RegMemOpcode));
+ if (!IsCmp)
+ MIB.addReg(DstReg, RegState::Define);
+ if (!IsUnary) {
+ if (IsTernary) {
+ MIB.add(*AccMO);
+ MRI->setRegClass(AccMO->getReg(), FPRC);
+ }
+ MIB.add(RegMO);
+ MRI->setRegClass(RegMO.getReg(), FPRC);
+ }
+
+ MachineOperand &Base = DefMI->getOperand(1);
+ MachineOperand &Disp = DefMI->getOperand(2);
+ MachineOperand &Indx = DefMI->getOperand(3);
+ if (Base.isReg()) // Could be a FrameIndex.
+ Base.setIsKill(false);
+ Indx.setIsKill(false);
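+ // The reg/mem opcodes only take a 12-bit displacement, so for a 20-bit
+ // displacement load (LDY/LEY) first compute the address with LAY.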
+ if (DefMI->getOpcode() == LoadOpcD12) {
+ MIB.add(Base).add(Disp).add(Indx);
+ } else {
+ Register AddrReg = MRI->createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ BuildMI(*MI.getParent(), *MIB, DL, get(SystemZ::LAY), AddrReg)
+ .add(Base).add(Disp).add(Indx);
+ MIB.addReg(AddrReg).addImm(0).addReg(SystemZ::NoRegister);
+ }
+ MIB.addMemOperand(*DefMI->memoperands_begin());
+ transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);
+ if (!IsCmp)
+ MIB->addRegisterDead(SystemZ::CC, TRI);
+ MRI->setRegClass(DstReg, FPRC);
+
+ return MIB;
+}
+
bool SystemZInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Register Reg,
MachineRegisterInfo *MRI) const {
@@ -938,20 +1079,6 @@ static LogicOp interpretAndImmediate(unsigned Opcode) {
}
}
-static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
- if (OldMI->registerDefIsDead(SystemZ::CC, /*TRI=*/nullptr)) {
- MachineOperand *CCDef =
- NewMI->findRegisterDefOperand(SystemZ::CC, /*TRI=*/nullptr);
- if (CCDef != nullptr)
- CCDef->setIsDead(true);
- }
-}
-
-static void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
- MachineInstr::MIFlag Flag) {
- if (OldMI->getFlag(Flag))
- NewMI->setFlag(Flag);
-}
MachineInstr *
SystemZInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 09ffcb6690502e..6394a3ef925b5d 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -254,6 +254,12 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
const DebugLoc &DL, Register DstReg,
ArrayRef<MachineOperand> Cond, Register TrueReg,
Register FalseReg) const override;
+ void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
+ MachineInstr::MIFlag Flag) const;
+ MachineInstr *optimizeLoadInstr(MachineInstr &MI,
+ MachineRegisterInfo *MRI,
+ Register &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const override;
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
MachineRegisterInfo *MRI) const override;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 245e3c3399a986..3ec2db2610a309 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -139,7 +139,7 @@ let Predicates = [FeatureVector] in {
// LEY and LDY offer full 20-bit displacement fields. It's often better
// to use those instructions rather than force a 20-bit displacement
// into a GPR temporary.
- let mayLoad = 1 in {
+ let mayLoad = 1, canFoldAsLoad = 1 in {
def VL32 : UnaryAliasVRX<z_load, v32sb, bdxaddr12pair>;
def VL64 : UnaryAliasVRX<z_load, v64db, bdxaddr12pair>;
}
@@ -1061,15 +1061,15 @@ multiclass VectorRounding<Instruction insn, TypedReg tr> {
let Predicates = [FeatureVector] in {
// Add.
let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
- def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
- def VFADB : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>;
- def WFADB : BinaryVRRc<"wfadb", 0xE7E3, any_fadd, v64db, v64db, 3, 8, 0,
- "adbr">;
+ def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
+ def VFADB : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>;
+ defm WFADB : BinaryVRRcAndCCPseudo<"wfadb", 0xE7E3, any_fadd, v64db, v64db,
+ 3, 8, 0, "adbr">;
let Predicates = [FeatureVectorEnhancements1] in {
- def VFASB : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>;
- def WFASB : BinaryVRRc<"wfasb", 0xE7E3, any_fadd, v32sb, v32sb, 2, 8, 0,
- "aebr">;
- def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, any_fadd, v128xb, v128xb, 4, 8>;
+ def VFASB : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>;
+ defm WFASB : BinaryVRRcAndCCPseudo<"wfasb", 0xE7E3, any_fadd, v32sb, v32sb,
+ 2, 8, 0, "aebr">;
+ def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, any_fadd, v128xb, v128xb, 4, 8>;
}
}
@@ -1389,15 +1389,15 @@ let Predicates = [FeatureVector] in {
// Subtract.
let Uses = [FPC], mayRaiseFPException = 1 in {
- def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
- def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, any_fsub, v128db, v128db, 3, 0>;
- def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, any_fsub, v64db, v64db, 3, 8, 0,
- "sdbr">;
+ def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
+ def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, any_fsub, v128db, v128db, 3, 0>;
+ defm WFSDB : BinaryVRRcAndCCPseudo<"wfsdb", 0xE7E2, any_fsub, v64db, v64db,
+ 3, 8, 0, "sdbr">;
let Predicates = [FeatureVectorEnhancements1] in {
- def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, any_fsub, v128sb, v128sb, 2, 0>;
- def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, any_fsub, v32sb, v32sb, 2, 8, 0,
- "sebr">;
- def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, any_fsub, v128xb, v128xb, 4, 8>;
+ def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, any_fsub, v128sb, v128sb, 2, 0>;
+ defm WFSSB : BinaryVRRcAndCCPseudo<"wfssb", 0xE7E2, any_fsub, v32sb, v32sb,
+ 2, 8, 0, "sebr">;
+ def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, any_fsub, v128xb, v128xb, 4, 8>;
}
}
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index 1611436b01b7fa..4a70a72a484232 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -744,6 +744,19 @@ defm block_and : block_op<and>;
defm block_or : block_op<or>;
defm block_xor : block_op<xor>;
+// A load (into an FPR) that is only selected when the relevant vector facility
+// (vector enhancements 1 for f32, the vector facility otherwise) is not present.
+def z_fprload : PatFrag<(ops node:$ptr), (load node:$ptr),
+ [{ EVT MemVT = cast<LoadSDNode>(N)->getMemoryVT();
+ EVT LoadVT = N->getValueType(0);
+ assert(MemVT == LoadVT && "Unexpected load.");
+ if (MemVT == MVT::f32)
+ return !Subtarget->hasVectorEnhancements1();
+ return !Subtarget->hasVector();
+ }]>;
+def z_fpr_any_extloadf32 : PatFrag<(ops node:$ptr), (any_extloadf32 node:$ptr),
+ [{ return !Subtarget->hasVector(); }]>;
+
// Insertions.
def inserti8 : PatFrag<(ops node:$src1, node:$src2),
(or (and node:$src1, -256), node:$src2)>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index 9ce1a0d06b5afd..5975d884144a1a 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -1344,7 +1344,7 @@ def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCIDB$")>;
// Add / subtract
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)$")>;
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
// Multiply / multiply-and-add/subtract
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
index 120d4a457ee396..f18d304d7a8a37 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -1388,9 +1388,9 @@ def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFTCIXB$")>;
// Add / subtract
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)$")>;
def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)SB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>;
def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
// Multiply / multiply-and-add/subtract
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
index acba3a1fd9919e..52fa35c5038ccf 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -1431,9 +1431,9 @@ def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFTCIXB$")>;
// Add / subtract
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>;
def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
// Multiply / multiply-and-add/subtract
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
index dd82b2b9b71e75..975671d1a24436 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
@@ -1437,9 +1437,9 @@ def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFTCIXB$")>;
// Add / subtract
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>;
def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
// Multiply / multiply-and-add/subtract
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 2436df0c8fb0a2..75b8f00e026d73 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -259,6 +259,7 @@ bool SystemZPassConfig::addILPOpts() {
void SystemZPassConfig::addPreRegAlloc() {
addPass(createSystemZCopyPhysRegsPass(getSystemZTargetMachine()));
+ addPass(createSystemZFinalizeRegMemPass(getSystemZTargetMachine()));
}
void SystemZPassConfig::addPostRewrite() {
diff --git a/llvm/test/CodeGen/SystemZ/fp-regmem-folding-01.ll b/llvm/test/CodeGen/SystemZ/fp-regmem-folding-01.ll
new file mode 100644
index 00000000000000..84726e889d3577
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-regmem-folding-01.ll
@@ -0,0 +1,251 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -O3 -print-before=peephole-opt \
+; RUN: -print-after=peephole-opt,systemz-finalize-regmem -verify-machineinstrs 2>&1 \
+; RUN: | FileCheck %s
+
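+; In each function the first load has a single use and gets folded into the FP
+; reg/mem instruction by the peephole optimizer, while the second loaded value
+; is also stored and therefore stays in a register; its reg/reg CC pseudo is
+; later converted to the plain VR instruction by systemz-finalize-regmem.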
+define void @f0(float %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
+; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: [[LD1:%[0-9]+:vr32bit]] = VL32 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s32) from %ir.src1)
+; CHECK-NEXT: vr32bit = nofpexcept WFASB_CCPseudo %0:fp32bit, killed [[LD1]], implicit-def dead $cc, implicit $fpc
+; CHECK: [[LD2:%[0-9]+:vr32bit]] = VL32 %2:addr64bit, 0, $noreg :: (load (s32) from %ir.src2)
+; CHECK-NEXT: vr32bit = nofpexcept WFASB_CCPseudo %0:fp32bit, [[LD2]], implicit-def dead $cc, implicit $fpc
+; CHECK-NEXT: VST32 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp32bit = nofpexcept AEB %0:fp32bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s32) from %ir.src1)
+; CHECK: %8:vr32bit = nofpexcept WFASB_CCPseudo %0:fp32bit, [[LD2]], implicit-def dead $cc, implicit $fpc
+
+; CHECK: # *** IR Dump After SystemZ Finalize RegMem (systemz-finalize-regmem) ***:
+; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp32bit = nofpexcept AEB %0:fp32bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s32) from %ir.src1)
+; CHECK: %8:vr32bit = nofpexcept WFASB %0:fp32bit, [[LD2]], implicit $fpc
+
+ %l1 = load float, ptr %src1
+ %res1 = fadd float %a1, %l1
+ store volatile float %res1, ptr %dst
+
+ %l2 = load float, ptr %src2
+ %res2 = fadd float %a1, %l2
+ store volatile float %l2, ptr %dst
+ store volatile float %res2, ptr %dst
+
+ ret void
+}
+
+define void @f1(double %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
+; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: [[LD1:%[0-9]+:vr64bit]] = VL64 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s64) from %ir.src1)
+; CHECK-NEXT: vr64bit = nofpexcept WFADB_CCPseudo %0:fp64bit, killed [[LD1]], implicit-def dead $cc, implicit $fpc
+; CHECK: [[LD2:%[0-9]+:vr64bit]] = VL64 %2:addr64bit, 0, $noreg :: (load (s64) from %ir.src2)
+; CHECK-NEXT: vr64bit = nofpexcept WFADB_CCPseudo %0:fp64bit, [[LD2]], implicit-def dead $cc, implicit $fpc
+; CHECK-NEXT: VST64 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp64bit = nofpexcept ADB %0:fp64bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s64) from %ir.src1)
+; CHECK: %8:vr64bit = nofpexcept WFADB_CCPseudo %0:fp64bit, [[LD2]], implicit-def dead $cc, implicit $fpc
+
+; CHECK: # *** IR Dump After SystemZ Finalize RegMem (systemz-finalize-regmem) ***:
+; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp64bit = nofpexcept ADB %0:fp64bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s64) from %ir.src1)
+; CHECK: %8:vr64bit = nofpexcept WFADB %0:fp64bit, [[LD2]], implicit $fpc
+
+ %l1 = load double, ptr %src1
+ %res1 = fadd double %a1, %l1
+ store volatile double %res1, ptr %dst
+
+ %l2 = load double, ptr %src2
+ %res2 = fadd double %a1, %l2
+ store volatile double %l2, ptr %dst
+ store volatile double %res2, ptr %dst
+
+ ret void
+}
+
+define void @f2(float %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
+; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f2: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: [[LD1:%[0-9]+:vr32bit]] = VL32 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s32) from %ir.src1)
+; CHECK-NEXT: vr32bit = nofpexcept WFSSB_CCPseudo %0:fp32bit, killed [[LD1]], implicit-def dead $cc, implicit $fpc
+; CHECK: [[LD2:%[0-9]+:vr32bit]] = VL32 %2:addr64bit, 0, $noreg :: (load (s32) from %ir.src2)
+; CHECK-NEXT: vr32bit = nofpexcept WFSSB_CCPseudo %0:fp32bit, [[LD2]], implicit-def dead $cc, implicit $fpc
+; CHECK-NEXT: VST32 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f2: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp32bit = nofpexcept SEB %0:fp32bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s32) from %ir.src1)
+; CHECK: %8:vr32bit = nofpexcept WFSSB_CCPseudo %0:fp32bit, [[LD2]], implicit-def dead $cc, implicit $fpc
+
+; CHECK: # *** IR Dump After SystemZ Finalize RegMem (systemz-finalize-regmem) ***:
+; CHECK-NEXT: # Machine code for function f2: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp32bit = nofpexcept SEB %0:fp32bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s32) from %ir.src1)
+; CHECK: %8:vr32bit = nofpexcept WFSSB %0:fp32bit, [[LD2]], implicit $fpc
+
+ %l1 = load float, ptr %src1
+ %res1 = fsub float %a1, %l1
+ store volatile float %res1, ptr %dst
+
+ %l2 = load float, ptr %src2
+ %res2 = fsub float %a1, %l2
+ store volatile float %l2, ptr %dst
+ store volatile float %res2, ptr %dst
+
+ ret void
+}
+
+define void @f3(double %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
+; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f3: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: [[LD1:%[0-9]+:vr64bit]] = VL64 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s64) from %ir.src1)
+; CHECK-NEXT: vr64bit = nofpexcept WFSDB_CCPseudo %0:fp64bit, killed [[LD1]], implicit-def dead $cc, implicit $fpc
+; CHECK: [[LD2:%[0-9]+:vr64bit]] = VL64 %2:addr64bit, 0, $noreg :: (load (s64) from %ir.src2)
+; CHECK-NEXT: vr64bit = nofpexcept WFSDB_CCPseudo %0:fp64bit, [[LD2]], implicit-def dead $cc, implicit $fpc
+; CHECK-NEXT: VST64 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f3: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp64bit = nofpexcept SDB %0:fp64bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s64) from %ir.src1)
+; CHECK: %8:vr64bit = nofpexcept WFSDB_CCPseudo %0:fp64bit, [[LD2]], implicit-def dead $cc, implicit $fpc
+
+; CHECK: # *** IR Dump After SystemZ Finalize RegMem (systemz-finalize-regmem) ***:
+; CHECK-NEXT: # Machine code for function f3: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp64bit = nofpexcept SDB %0:fp64bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s64) from %ir.src1)
+; CHECK: %8:vr64bit = nofpexcept WFSDB %0:fp64bit, [[LD2]], implicit $fpc
+
+ %l1 = load double, ptr %src1
+ %res1 = fsub double %a1, %l1
+ store volatile double %res1, ptr %dst
+
+ %l2 = load double, ptr %src2
+ %res2 = fsub double %a1, %l2
+ store volatile double %l2, ptr %dst
+ store volatile double %res2, ptr %dst
+
+ ret void
+}
+
+define void @f4(float %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
+; CHECK-LABEL: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f4: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: [[LD1:%[0-9]+:vr32bit]] = VL32 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s32) from %ir.src1)
+; CHECK-NEXT: vr32bit = nofpexcept WFMSB %0:fp32bit, killed [[LD1]], implicit $fpc
+; CHECK: [[LD2:%[0-9]+:vr32bit]] = VL32 %2:addr64bit, 0, $noreg :: (load (s32) from %ir.src2)
+; CHECK-NEXT: vr32bit = nofpexcept WFMSB %0:fp32bit, [[LD2]], implicit $fpc
+; CHECK-NEXT: VST32 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f4: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp32bit = nofpexcept MEEB %0:fp32bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit $fpc :: (load (s32) from %ir.src1)
+; CHECK: %8:vr32bit = nofpexcept WFMSB %0:fp32bit, [[LD2]], implicit $fpc
+
+ %l1 = load float, ptr %src1
+ %res1 = fmul float %a1, %l1
+ store volatile float %res1, ptr %dst
+
+ %l2 = load float, ptr %src2
+ %res2 = fmul float %a1, %l2
+ store volatile float %l2, ptr %dst
+ store volatile float %res2, ptr %dst
+
+ ret void
+}
+
+define void @f5(double %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
+; CHECK-LABEL: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f5: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: [[LD1:%[0-9]+:vr64bit]] = VL64 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s64) from %ir.src1)
+; CHECK-NEXT: vr64bit = nofpexcept WFMDB %0:fp64bit, killed [[LD1]], implicit $fpc
+; CHECK: [[LD2:%[0-9]+:vr64bit]] = VL64 %2:addr64bit, 0, $noreg :: (load (s64) from %ir.src2)
+; CHECK-NEXT: vr64bit = nofpexcept WFMDB %0:fp64bit, [[LD2]], implicit $fpc
+; CHECK-NEXT: VST64 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f5: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp64bit = nofpexcept MDB %0:fp64bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit $fpc :: (load (s64) from %ir.src1)
+; CHECK: %8:vr64bit = nofpexcept WFMDB %0:fp64bit, [[LD2]], implicit $fpc
+
+ %l1 = load double, ptr %src1
+ %res1 = fmul double %a1, %l1
+ store volatile double %res1, ptr %dst
+
+ %l2 = load double, ptr %src2
+ %res2 = fmul double %a1, %l2
+ store volatile double %l2, ptr %dst
+ store volatile double %res2, ptr %dst
+
+ ret void
+}
+
+define void @f6(float %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
+; CHECK-LABEL: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f6: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: [[LD1:%[0-9]+:vr32bit]] = VL32 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s32) from %ir.src1)
+; CHECK-NEXT: vr32bit = nofpexcept WFDSB %0:fp32bit, killed [[LD1]], implicit $fpc
+; CHECK: [[LD2:%[0-9]+:vr32bit]] = VL32 %2:addr64bit, 0, $noreg :: (load (s32) from %ir.src2)
+; CHECK-NEXT: vr32bit = nofpexcept WFDSB %0:fp32bit, [[LD2]], implicit $fpc
+; CHECK-NEXT: VST32 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f6: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp32bit = nofpexcept DEB %0:fp32bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit $fpc :: (load (s32) from %ir.src1)
+; CHECK: %8:vr32bit = nofpexcept WFDSB %0:fp32bit, [[LD2]], implicit $fpc
+
+ %l1 = load float, ptr %src1
+ %res1 = fdiv float %a1, %l1
+ store volatile float %res1, ptr %dst
+
+ %l2 = load float, ptr %src2
+ %res2 = fdiv float %a1, %l2
+ store volatile float %l2, ptr %dst
+ store volatile float %res2, ptr %dst
+
+ ret void
+}
+
+define void @f7(double %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
+; CHECK-LABEL: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f7: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: [[LD1:%[0-9]+:vr64bit]] = VL64 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s64) from %ir.src1)
+; CHECK-NEXT: vr64bit = nofpexcept WFDDB %0:fp64bit, killed [[LD1]], implicit $fpc
+; CHECK: [[LD2:%[0-9]+:vr64bit]] = VL64 %2:addr64bit, 0, $noreg :: (load (s64) from %ir.src2)
+; CHECK-NEXT: vr64bit = nofpexcept WFDDB %0:fp64bit, [[LD2]], implicit $fpc
+; CHECK-NEXT: VST64 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f7: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp64bit = nofpexcept DDB %0:fp64bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit $fpc :: (load (s64) from %ir.src1)
+; CHECK: %8:vr64bit = nofpexcept WFDDB %0:fp64bit, [[LD2]], implicit $fpc
+
+ %l1 = load double, ptr %src1
+ %res1 = fdiv double %a1, %l1
+ store volatile double %res1, ptr %dst
+
+ %l2 = load double, ptr %src2
+ %res2 = fdiv double %a1, %l2
+ store volatile double %l2, ptr %dst
+ store volatile double %res2, ptr %dst
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-regmem-folding-02.ll b/llvm/test/CodeGen/SystemZ/fp-regmem-folding-02.ll
new file mode 100644
index 00000000000000..4fcefaefd0f324
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-regmem-folding-02.ll
@@ -0,0 +1,164 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -O3 -print-before=peephole-opt \
+; RUN: -print-after=peephole-opt -verify-machineinstrs 2>&1 | FileCheck %s
+
+define void @f0(float %A, ptr %src, ptr %dst) {
+; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: %3:vr32bit = VL32 [[ADDR1:%[0-9]+:addr64bit]], 4, $noreg :: (load (s32) from %ir.arrayidx1)
+; CHECK-NEXT: %4:vr32bit = VL32 %1:addr64bit, 8, $noreg :: (load (s32) from %ir.arrayidx2)
+; CHECK-NEXT: vr32bit = contract nofpexcept WFMASB killed %3:vr32bit, killed %4:vr32bit, %0:fp32bit, implicit $fpc
+; CHECK: %6:vr32bit = VL32 %1:addr64bit, 12, $noreg :: (load (s32) from %ir.arrayidx3)
+; CHECK-NEXT: %7:vr32bit = VL32 %1:addr64bit, 16, $noreg :: (load (s32) from %ir.arrayidx4)
+; CHECK-NEXT: %8:vr32bit = contract nofpexcept WFMASB %6:vr32bit, %7:vr32bit, %0:fp32bit, implicit $fpc
+; CHECK-NEXT: VST32 killed %8:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
+; CHECK-NEXT: VST32 %6:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
+; CHECK-NEXT: VST32 %7:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp32bit = nofpexcept MAEB %0:fp32bit(tied-def 0), killed %4:fp32bit, [[ADDR1]], 4, $noreg, implicit $fpc :: (load (s32) from %ir.arrayidx1)
+; CHECK: vr32bit = contract nofpexcept WFMASB %6:vr32bit, %7:vr32bit, %0:fp32bit, implicit $fpc
+
+ %arrayidx1 = getelementptr inbounds float, ptr %src, i64 1
+ %arrayidx2 = getelementptr inbounds float, ptr %src, i64 2
+ %L1l = load float, ptr %arrayidx1
+ %L1r = load float, ptr %arrayidx2
+ %M1 = fmul contract float %L1l, %L1r
+ %A1 = fadd contract float %A, %M1
+ store volatile float %A1, ptr %dst
+
+ %arrayidx3 = getelementptr inbounds float, ptr %src, i64 3
+ %arrayidx4 = getelementptr inbounds float, ptr %src, i64 4
+ %L2l = load float, ptr %arrayidx3
+ %L2r = load float, ptr %arrayidx4
+ %M2 = fmul contract float %L2l, %L2r
+ %A2 = fadd contract float %A, %M2
+ store volatile float %A2, ptr %dst
+ store volatile float %L2l, ptr %dst
+ store volatile float %L2r, ptr %dst
+
+ ret void
+}
+
+define void @f1(double %A, ptr %src, ptr %dst) {
+; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: %3:vr64bit = VL64 [[ADDR1:%[0-9]+:addr64bit]], 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %4:vr64bit = VL64 %1:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: vr64bit = contract nofpexcept WFMADB killed %3:vr64bit, killed %4:vr64bit, %0:fp64bit, implicit $fpc
+; CHECK: %6:vr64bit = VL64 %1:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx3)
+; CHECK-NEXT: %7:vr64bit = VL64 %1:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: %8:vr64bit = contract nofpexcept WFMADB %6:vr64bit, %7:vr64bit, %0:fp64bit, implicit $fpc
+; CHECK-NEXT: VST64 killed %8:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
+; CHECK-NEXT: VST64 %6:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
+; CHECK-NEXT: VST64 %7:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp64bit = nofpexcept MADB %0:fp64bit(tied-def 0), killed %4:fp64bit, [[ADDR1]], 8, $noreg, implicit $fpc :: (load (s64) from %ir.arrayidx1)
+; CHECK: vr64bit = contract nofpexcept WFMADB %6:vr64bit, %7:vr64bit, %0:fp64bit, implicit $fpc
+
+ %arrayidx1 = getelementptr inbounds double, ptr %src, i64 1
+ %arrayidx2 = getelementptr inbounds double, ptr %src, i64 2
+ %L1l = load double, ptr %arrayidx1
+ %L1r = load double, ptr %arrayidx2
+ %M1 = fmul contract double %L1l, %L1r
+ %A1 = fadd contract double %A, %M1
+ store volatile double %A1, ptr %dst
+
+ %arrayidx3 = getelementptr inbounds double, ptr %src, i64 3
+ %arrayidx4 = getelementptr inbounds double, ptr %src, i64 4
+ %L2l = load double, ptr %arrayidx3
+ %L2r = load double, ptr %arrayidx4
+ %M2 = fmul contract double %L2l, %L2r
+ %A2 = fadd contract double %A, %M2
+ store volatile double %A2, ptr %dst
+ store volatile double %L2l, ptr %dst
+ store volatile double %L2r, ptr %dst
+
+ ret void
+}
+
+define void @f2(float %A, ptr %src, ptr %dst) {
+; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f2: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: %3:vr32bit = VL32 [[ADDR1:%[0-9]+:addr64bit]], 4, $noreg :: (load (s32) from %ir.arrayidx1)
+; CHECK-NEXT: %4:vr32bit = VL32 %1:addr64bit, 8, $noreg :: (load (s32) from %ir.arrayidx2)
+; CHECK-NEXT: vr32bit = nofpexcept WFMSSB killed %3:vr32bit, killed %4:vr32bit, %0:fp32bit, implicit $fpc
+; CHECK: %6:vr32bit = VL32 %1:addr64bit, 12, $noreg :: (load (s32) from %ir.arrayidx3)
+; CHECK-NEXT: %7:vr32bit = VL32 %1:addr64bit, 16, $noreg :: (load (s32) from %ir.arrayidx4)
+; CHECK-NEXT: %8:vr32bit = nofpexcept WFMSSB %6:vr32bit, %7:vr32bit, %0:fp32bit, implicit $fpc
+; CHECK-NEXT: VST32 killed %8:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
+; CHECK-NEXT: VST32 %6:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
+; CHECK-NEXT: VST32 %7:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f2: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp32bit = nofpexcept MSEB %0:fp32bit(tied-def 0), killed %4:fp32bit, [[ADDR1]], 4, $noreg, implicit $fpc :: (load (s32) from %ir.arrayidx1)
+; CHECK: vr32bit = nofpexcept WFMSSB %6:vr32bit, %7:vr32bit, %0:fp32bit, implicit $fpc
+ %arrayidx1 = getelementptr inbounds float, ptr %src, i64 1
+ %arrayidx2 = getelementptr inbounds float, ptr %src, i64 2
+ %L1l = load float, ptr %arrayidx1
+ %L1r = load float, ptr %arrayidx2
+ %Negacc1 = fneg float %A
+ %A1 = call float @llvm.fma.f32 (float %L1l, float %L1r, float %Negacc1)
+ store volatile float %A1, ptr %dst
+
+ %arrayidx3 = getelementptr inbounds float, ptr %src, i64 3
+ %arrayidx4 = getelementptr inbounds float, ptr %src, i64 4
+ %L2l = load float, ptr %arrayidx3
+ %L2r = load float, ptr %arrayidx4
+ %Negacc2 = fneg float %A
+ %A2 = call float @llvm.fma.f32 (float %L2l, float %L2r, float %Negacc2)
+ store volatile float %A2, ptr %dst
+ store volatile float %L2l, ptr %dst
+ store volatile float %L2r, ptr %dst
+
+ ret void
+}
+
+define void @f3(double %A, ptr %src, ptr %dst) {
+; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f3: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: %3:vr64bit = VL64 [[ADDR1:%[0-9]+:addr64bit]], 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %4:vr64bit = VL64 %1:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: vr64bit = nofpexcept WFMSDB killed %3:vr64bit, killed %4:vr64bit, %0:fp64bit, implicit $fpc
+; CHECK: %6:vr64bit = VL64 %1:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx3)
+; CHECK-NEXT: %7:vr64bit = VL64 %1:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: %8:vr64bit = nofpexcept WFMSDB %6:vr64bit, %7:vr64bit, %0:fp64bit, implicit $fpc
+; CHECK-NEXT: VST64 killed %8:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
+; CHECK-NEXT: VST64 %6:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
+; CHECK-NEXT: VST64 %7:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f3: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: fp64bit = nofpexcept MSDB %0:fp64bit(tied-def 0), killed %4:fp64bit, [[ADDR1]], 8, $noreg, implicit $fpc :: (load (s64) from %ir.arrayidx1)
+; CHECK: vr64bit = nofpexcept WFMSDB %6:vr64bit, %7:vr64bit, %0:fp64bit, implicit $fpc
+ %arrayidx1 = getelementptr inbounds double, ptr %src, i64 1
+ %arrayidx2 = getelementptr inbounds double, ptr %src, i64 2
+ %L1l = load double, ptr %arrayidx1
+ %L1r = load double, ptr %arrayidx2
+ %Negacc1 = fneg double %A
+ %A1 = call double @llvm.fma.f64 (double %L1l, double %L1r, double %Negacc1)
+ store volatile double %A1, ptr %dst
+
+ %arrayidx3 = getelementptr inbounds double, ptr %src, i64 3
+ %arrayidx4 = getelementptr inbounds double, ptr %src, i64 4
+ %L2l = load double, ptr %arrayidx3
+ %L2r = load double, ptr %arrayidx4
+ %Negacc2 = fneg double %A
+ %A2 = call double @llvm.fma.f64 (double %L2l, double %L2r, double %Negacc2)
+ store volatile double %A2, ptr %dst
+ store volatile double %L2l, ptr %dst
+ store volatile double %L2r, ptr %dst
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-regmem-folding-03.ll b/llvm/test/CodeGen/SystemZ/fp-regmem-folding-03.ll
new file mode 100644
index 00000000000000..5de6bb4d6af5c5
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-regmem-folding-03.ll
@@ -0,0 +1,86 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -O3 -print-before=peephole-opt \
+; RUN: -print-after=peephole-opt -verify-machineinstrs 2>&1 | FileCheck %s
+
+define void @f0(ptr %src1, ptr %src2, ptr %dst) {
+; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: %3:vr32bit = VL32 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s32) from %ir.src1)
+; CHECK-NEXT: %4:vr64bit = nofpexcept WLDEB killed %3:vr32bit, implicit $fpc
+; CHECK: %5:vr32bit = VL32 %1:addr64bit, 0, $noreg :: (load (s32) from %ir.src2)
+; CHECK-NEXT: %6:vr64bit = nofpexcept WLDEB %5:vr32bit, implicit $fpc
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: %4:fp64bit = nofpexcept LDEB [[ADDR1]], 0, $noreg, implicit $fpc :: (load (s32) from %ir.src1)
+; CHECK: %5:vr32bit = VL32 %1:addr64bit, 0, $noreg :: (load (s32) from %ir.src2)
+; CHECK-NEXT: %6:vr64bit = nofpexcept WLDEB %5:vr32bit, implicit $fpc
+
+ %L1 = load float, ptr %src1
+ %D1 = fpext float %L1 to double
+ store volatile double %D1, ptr %dst
+
+ %L2 = load float, ptr %src2
+ %D2 = fpext float %L2 to double
+ store volatile double %D2, ptr %dst
+ store volatile float %L2, ptr %dst
+
+ ret void
+}
+
+define void @f1(ptr %ptr, ptr %dst) {
+; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: %2:vr32bit = VL32 [[ADDR2:%0:addr64bit]], 0, $noreg :: (load (s32) from %ir.ptr)
+; CHECK-NEXT: %3:vr32bit = nofpexcept WFSQSB killed %2:vr32bit, implicit $fpc
+; CHECK: %4:vr32bit = VL32 %0:addr64bit, 0, $noreg :: (load (s32) from %ir.ptr)
+; CHECK-NEXT: %5:vr32bit = nofpexcept WFSQSB %4:vr32bit, implicit $fpc
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: %3:fp32bit = nofpexcept SQEB [[ADDR2]], 0, $noreg, implicit $fpc :: (load (s32) from %ir.ptr)
+; CHECK: %4:vr32bit = VL32 %0:addr64bit, 0, $noreg :: (load (s32) from %ir.ptr)
+; CHECK-NEXT: %5:vr32bit = nofpexcept WFSQSB %4:vr32bit, implicit $fpc
+
+ %L1 = load float, ptr %ptr
+ %S1 = call float @llvm.sqrt.f32(float %L1)
+ store volatile float %S1, ptr %dst
+
+ %L2 = load float, ptr %ptr
+ %S2 = call float @llvm.sqrt.f32(float %L2)
+ store volatile float %S2, ptr %dst
+ store volatile float %L2, ptr %dst
+
+ ret void
+}
+
+define void @f2(ptr %ptr, ptr %dst) {
+; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f2: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: %2:vr64bit = VL64 [[ADDR2:%0:addr64bit]], 0, $noreg :: (load (s64) from %ir.ptr)
+; CHECK-NEXT: %3:vr64bit = nofpexcept WFSQDB killed %2:vr64bit, implicit $fpc
+; CHECK: %4:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.ptr)
+; CHECK-NEXT: %5:vr64bit = nofpexcept WFSQDB %4:vr64bit, implicit $fpc
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f2: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: %3:fp64bit = nofpexcept SQDB [[ADDR2]], 0, $noreg, implicit $fpc :: (load (s64) from %ir.ptr)
+; CHECK: %4:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.ptr)
+; CHECK-NEXT: %5:vr64bit = nofpexcept WFSQDB %4:vr64bit, implicit $fpc
+
+ %L1 = load double, ptr %ptr
+ %S1 = call double @llvm.sqrt.f64(double %L1)
+ store volatile double %S1, ptr %dst
+
+ %L2 = load double, ptr %ptr
+ %S2 = call double @llvm.sqrt.f64(double %L2)
+ store volatile double %S2, ptr %dst
+ store volatile double %L2, ptr %dst
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-regmem-folding-04.ll b/llvm/test/CodeGen/SystemZ/fp-regmem-folding-04.ll
new file mode 100644
index 00000000000000..58710a3be6489c
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-regmem-folding-04.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -O3 -print-before=peephole-opt \
+; RUN: -print-after=peephole-opt -verify-machineinstrs 2>&1 | FileCheck %s
+
+define void @f0(i64 %a, i64 %b, float %f1, ptr %src1, ptr %src2, ptr %dst) {
+; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: %6:vr32bit = VL32 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s32) from %ir.src1)
+; CHECK-NEXT: nofpexcept WFCSB %2:fp32bit, killed %6:vr32bit, implicit-def $cc, implicit $fpc
+; CHECK: %9:vr32bit = VL32 %4:addr64bit, 0, $noreg :: (load (s32) from %ir.src2)
+; CHECK-NEXT: nofpexcept WFCSB %2:fp32bit, %9:vr32bit, implicit-def $cc, implicit $fpc
+; CHECK: VST32 %9:vr32bit, %5:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: nofpexcept CEB %2:fp32bit, [[ADDR1]], 0, $noreg, implicit-def $cc, implicit $fpc :: (load (s32) from %ir.src1)
+; CHECK: nofpexcept WFCSB %2:fp32bit, %9:vr32bit, implicit-def $cc, implicit $fpc
+
+ %L1 = load float, ptr %src1
+ %C1 = fcmp oeq float %f1, %L1
+ %S1 = select i1 %C1, i64 0, i64 1
+ store volatile i64 %S1, ptr %dst
+
+ %L2 = load float, ptr %src2
+ %C2 = fcmp oeq float %f1, %L2
+ %S2 = select i1 %C2, i64 0, i64 1
+ store volatile i64 %S2, ptr %dst
+ store volatile float %L2, ptr %dst
+
+ ret void
+}
+
+define void @f1(i64 %a, i64 %b, double %f1, ptr %src1, ptr %src2, ptr %dst) {
+; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: %6:vr64bit = VL64 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s64) from %ir.src1)
+; CHECK-NEXT: nofpexcept WFCDB %2:fp64bit, killed %6:vr64bit, implicit-def $cc, implicit $fpc
+; CHECK: %9:vr64bit = VL64 %4:addr64bit, 0, $noreg :: (load (s64) from %ir.src2)
+; CHECK-NEXT: nofpexcept WFCDB %2:fp64bit, %9:vr64bit, implicit-def $cc, implicit $fpc
+; CHECK: VST64 %9:vr64bit, %5:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
+
+; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
+; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
+; CHECK-LABEL: bb.0 (%ir-block.0):
+; CHECK: nofpexcept CDB %2:fp64bit, [[ADDR1]], 0, $noreg, implicit-def $cc, implicit $fpc :: (load (s64) from %ir.src1)
+; CHECK: nofpexcept WFCDB %2:fp64bit, %9:vr64bit, implicit-def $cc, implicit $fpc
+
+ %L1 = load double, ptr %src1
+ %C1 = fcmp oeq double %f1, %L1
+ %S1 = select i1 %C1, i64 0, i64 1
+ store volatile i64 %S1, ptr %dst
+
+ %L2 = load double, ptr %src2
+ %C2 = fcmp oeq double %f1, %L2
+ %S2 = select i1 %C2, i64 0, i64 1
+ store volatile i64 %S2, ptr %dst
+ store volatile double %L2, ptr %dst
+
+ ret void
+}
>From d186dd87d5e6680ec81c352ad6c01472369ac1ff Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 8 Mar 2024 14:49:46 -0500
Subject: [PATCH 3/9] Make WFMADB/WFMASB clobber CC
---
.../Target/SystemZ/SystemZFinalizeRegMem.cpp | 2 +
.../lib/Target/SystemZ/SystemZInstrFormats.td | 15 +++++
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 12 ++--
llvm/lib/Target/SystemZ/SystemZInstrVector.td | 21 ++++---
llvm/lib/Target/SystemZ/SystemZScheduleZ13.td | 2 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ14.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ15.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ16.td | 4 +-
.../CodeGen/SystemZ/fp-regmem-folding-02.ll | 12 ++--
.../SystemZ/machine-combiner-reassoc-fp-03.ll | 12 ++--
.../SystemZ/machine-combiner-reassoc-fp-04.ll | 18 +++---
.../SystemZ/machine-combiner-reassoc-fp-08.ll | 10 ++--
.../SystemZ/machine-combiner-reassoc-fp-09.ll | 58 +++++++++----------
13 files changed, 97 insertions(+), 77 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZFinalizeRegMem.cpp b/llvm/lib/Target/SystemZ/SystemZFinalizeRegMem.cpp
index 8de409f4f6a3ee..68b2797b114e4b 100644
--- a/llvm/lib/Target/SystemZ/SystemZFinalizeRegMem.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFinalizeRegMem.cpp
@@ -71,6 +71,8 @@ bool SystemZFinalizeRegMem::visitMBB(MachineBasicBlock &MBB) {
: PseudoOpcode == SystemZ::WFASB_CCPseudo ? SystemZ::WFASB
: PseudoOpcode == SystemZ::WFSDB_CCPseudo ? SystemZ::WFSDB
: PseudoOpcode == SystemZ::WFSSB_CCPseudo ? SystemZ::WFSSB
+ : PseudoOpcode == SystemZ::WFMADB_CCPseudo ? SystemZ::WFMADB
+ : PseudoOpcode == SystemZ::WFMASB_CCPseudo ? SystemZ::WFMASB
: 0;
if (TargetOpcode) {
MI.setDesc(TII->get(TargetOpcode));
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index 4c804832eb1a20..5a7be17c4dc1f6 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5550,3 +5550,18 @@ multiclass BinaryVRRcAndCCPseudo<string mnemonic, bits<16> opcode,
(operator (tr2.vt tr2.op:$V2),
(tr2.vt tr2.op:$V3)))]>;
}
+
+multiclass TernaryVRReAndCCPseudo<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator,
+ TypedReg tr1, TypedReg tr2, bits<4> m5 = 0,
+ bits<4> type = 0, string fp_mnemonic = ""> {
+ def "" : TernaryVRRe<mnemonic, opcode, null_frag, tr1, tr2, m5, type,
+ fp_mnemonic>;
+ let Defs = [CC] in
+ def _CCPseudo : Pseudo<(outs tr1.op:$V1),
+ (ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
+ [(set (tr1.vt tr1.op:$V1),
+ (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ (tr1.vt tr1.op:$V4)))]>;
+}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index a31730e09710e6..d3e5c3c5d07846 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -654,7 +654,7 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
: MI.getOpcode() == SystemZ::WFSDB_CCPseudo ? SystemZ::SDB
: MI.getOpcode() == SystemZ::WFMDB ? SystemZ::MDB
: MI.getOpcode() == SystemZ::WFDDB ? SystemZ::DDB
- : MI.getOpcode() == SystemZ::WFMADB ? SystemZ::MADB
+ : MI.getOpcode() == SystemZ::WFMADB_CCPseudo ? SystemZ::MADB
: MI.getOpcode() == SystemZ::WFMSDB ? SystemZ::MSDB
: MI.getOpcode() == SystemZ::WFSQDB ? SystemZ::SQDB
: MI.getOpcode() == SystemZ::WFCDB ? SystemZ::CDB
@@ -668,7 +668,7 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
: MI.getOpcode() == SystemZ::WFSSB_CCPseudo ? SystemZ::SEB
: MI.getOpcode() == SystemZ::WFMSB ? SystemZ::MEEB
: MI.getOpcode() == SystemZ::WFDSB ? SystemZ::DEB
- : MI.getOpcode() == SystemZ::WFMASB ? SystemZ::MAEB
+ : MI.getOpcode() == SystemZ::WFMASB_CCPseudo ? SystemZ::MAEB
: MI.getOpcode() == SystemZ::WFMSSB ? SystemZ::MSEB
: MI.getOpcode() == SystemZ::WFSQSB ? SystemZ::SQEB
: MI.getOpcode() == SystemZ::WFCSB ? SystemZ::CEB
@@ -1142,8 +1142,8 @@ bool SystemZInstrInfo::IsReassociableFMA(const MachineInstr *MI) const {
case SystemZ::VFMADB:
case SystemZ::VFMASB:
case SystemZ::WFMAXB:
- case SystemZ::WFMADB:
- case SystemZ::WFMASB:
+ case SystemZ::WFMADB_CCPseudo:
+ case SystemZ::WFMASB_CCPseudo:
return hasReassocFlags(MI);
default:
break;
@@ -1352,9 +1352,9 @@ static void getSplitFMAOpcodes(unsigned FMAOpc, unsigned &AddOpc,
case SystemZ::VFMADB: AddOpc = SystemZ::VFADB; MulOpc = SystemZ::VFMDB; break;
case SystemZ::VFMASB: AddOpc = SystemZ::VFASB; MulOpc = SystemZ::VFMSB; break;
case SystemZ::WFMAXB: AddOpc = SystemZ::WFAXB; MulOpc = SystemZ::WFMXB; break;
- case SystemZ::WFMADB:
+ case SystemZ::WFMADB_CCPseudo:
AddOpc = SystemZ::WFADB_CCPseudo; MulOpc = SystemZ::WFMDB; break;
- case SystemZ::WFMASB:
+ case SystemZ::WFMASB_CCPseudo:
AddOpc = SystemZ::WFASB_CCPseudo; MulOpc = SystemZ::WFMSB; break;
default:
llvm_unreachable("Expected FMA opcode.");
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 3ec2db2610a309..476a9b1dfed83b 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -1286,17 +1286,20 @@ let Predicates = [FeatureVector] in {
}
}
- // Multiply and add.
+  // Multiply and add. The 64/32-bit forms may participate in reassociation
+  // with additions during machine combining. Pretend that they clobber CC so
+  // that an add pulled down into their place can later be folded into a
+  // reg/mem instruction, which clobbers CC (while MADB/MAEB do not).
let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
- def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>;
- def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, any_fma, v128db, v128db, 0, 3>;
- def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, any_fma, v64db, v64db, 8, 3,
- "madbr">;
+ def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>;
+ def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, any_fma, v128db, v128db, 0, 3>;
+ defm WFMADB : TernaryVRReAndCCPseudo<"wfmadb", 0xE78F, any_fma, v64db, v64db,
+ 8, 3, "madbr">;
let Predicates = [FeatureVectorEnhancements1] in {
- def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, any_fma, v128sb, v128sb, 0, 2>;
- def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, any_fma, v32sb, v32sb, 8, 2,
- "maebr">;
- def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, any_fma, v128xb, v128xb, 8, 4>;
+ def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, any_fma, v128sb, v128sb, 0, 2>;
+ defm WFMASB : TernaryVRReAndCCPseudo<"wfmasb", 0xE78F, any_fma, v32sb, v32sb,
+ 8, 2, "maebr">;
+ def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, any_fma, v128xb, v128xb, 8, 4>;
}
}
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index 5975d884144a1a..431e916523dae3 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -1352,7 +1352,7 @@ def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFMDB$")>;
def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFMDB$")>;
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM(A|S)$")>;
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(A|S)DB(_CCPseudo)?$")>;
// Divide / square root
def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFD$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
index f18d304d7a8a37..652ab3ea932c5f 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -1401,9 +1401,9 @@ def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFMSB$")>;
def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(N)?M(A|S)$")>;
def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB(_CCPseudo)?$")>;
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(N)?M(A|S)SB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB(_CCPseudo)?$")>;
def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>;
// Divide / square root
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
index 52fa35c5038ccf..d53e4d4b97219f 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -1443,9 +1443,9 @@ def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>;
def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB(_CCPseudo)?$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)SB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB(_CCPseudo)?$")>;
def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>;
// Divide / square root
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
index 975671d1a24436..42628cea69555f 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
@@ -1449,9 +1449,9 @@ def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>;
def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB(_CCPseudo)?$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)SB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB(_CCPseudo)?$")>;
def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>;
// Divide / square root
diff --git a/llvm/test/CodeGen/SystemZ/fp-regmem-folding-02.ll b/llvm/test/CodeGen/SystemZ/fp-regmem-folding-02.ll
index 4fcefaefd0f324..062bc1ba4042e0 100644
--- a/llvm/test/CodeGen/SystemZ/fp-regmem-folding-02.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-regmem-folding-02.ll
@@ -7,10 +7,10 @@ define void @f0(float %A, ptr %src, ptr %dst) {
; CHECK-LABEL: bb.0 (%ir-block.0):
; CHECK: %3:vr32bit = VL32 [[ADDR1:%[0-9]+:addr64bit]], 4, $noreg :: (load (s32) from %ir.arrayidx1)
; CHECK-NEXT: %4:vr32bit = VL32 %1:addr64bit, 8, $noreg :: (load (s32) from %ir.arrayidx2)
-; CHECK-NEXT: vr32bit = contract nofpexcept WFMASB killed %3:vr32bit, killed %4:vr32bit, %0:fp32bit, implicit $fpc
+; CHECK-NEXT: vr32bit = contract nofpexcept WFMASB_CCPseudo killed %3:vr32bit, killed %4:vr32bit, %0:fp32bit, implicit-def dead $cc, implicit $fpc
; CHECK: %6:vr32bit = VL32 %1:addr64bit, 12, $noreg :: (load (s32) from %ir.arrayidx3)
; CHECK-NEXT: %7:vr32bit = VL32 %1:addr64bit, 16, $noreg :: (load (s32) from %ir.arrayidx4)
-; CHECK-NEXT: %8:vr32bit = contract nofpexcept WFMASB %6:vr32bit, %7:vr32bit, %0:fp32bit, implicit $fpc
+; CHECK-NEXT: %8:vr32bit = contract nofpexcept WFMASB_CCPseudo %6:vr32bit, %7:vr32bit, %0:fp32bit, implicit-def dead $cc, implicit $fpc
; CHECK-NEXT: VST32 killed %8:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
; CHECK-NEXT: VST32 %6:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
; CHECK-NEXT: VST32 %7:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
@@ -19,7 +19,7 @@ define void @f0(float %A, ptr %src, ptr %dst) {
; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
; CHECK-LABEL: bb.0 (%ir-block.0):
; CHECK: fp32bit = nofpexcept MAEB %0:fp32bit(tied-def 0), killed %4:fp32bit, [[ADDR1]], 4, $noreg, implicit $fpc :: (load (s32) from %ir.arrayidx1)
-; CHECK: vr32bit = contract nofpexcept WFMASB %6:vr32bit, %7:vr32bit, %0:fp32bit, implicit $fpc
+; CHECK: vr32bit = contract nofpexcept WFMASB_CCPseudo %6:vr32bit, %7:vr32bit, %0:fp32bit, implicit-def dead $cc, implicit $fpc
%arrayidx1 = getelementptr inbounds float, ptr %src, i64 1
%arrayidx2 = getelementptr inbounds float, ptr %src, i64 2
@@ -48,10 +48,10 @@ define void @f1(double %A, ptr %src, ptr %dst) {
; CHECK-LABEL: bb.0 (%ir-block.0):
; CHECK: %3:vr64bit = VL64 [[ADDR1:%[0-9]+:addr64bit]], 8, $noreg :: (load (s64) from %ir.arrayidx1)
; CHECK-NEXT: %4:vr64bit = VL64 %1:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
-; CHECK-NEXT: vr64bit = contract nofpexcept WFMADB killed %3:vr64bit, killed %4:vr64bit, %0:fp64bit, implicit $fpc
+; CHECK-NEXT: vr64bit = contract nofpexcept WFMADB_CCPseudo killed %3:vr64bit, killed %4:vr64bit, %0:fp64bit, implicit-def dead $cc, implicit $fpc
; CHECK: %6:vr64bit = VL64 %1:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx3)
; CHECK-NEXT: %7:vr64bit = VL64 %1:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx4)
-; CHECK-NEXT: %8:vr64bit = contract nofpexcept WFMADB %6:vr64bit, %7:vr64bit, %0:fp64bit, implicit $fpc
+; CHECK-NEXT: %8:vr64bit = contract nofpexcept WFMADB_CCPseudo %6:vr64bit, %7:vr64bit, %0:fp64bit, implicit-def dead $cc, implicit $fpc
; CHECK-NEXT: VST64 killed %8:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
; CHECK-NEXT: VST64 %6:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
; CHECK-NEXT: VST64 %7:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
@@ -60,7 +60,7 @@ define void @f1(double %A, ptr %src, ptr %dst) {
; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
; CHECK-LABEL: bb.0 (%ir-block.0):
; CHECK: fp64bit = nofpexcept MADB %0:fp64bit(tied-def 0), killed %4:fp64bit, [[ADDR1]], 8, $noreg, implicit $fpc :: (load (s64) from %ir.arrayidx1)
-; CHECK: vr64bit = contract nofpexcept WFMADB %6:vr64bit, %7:vr64bit, %0:fp64bit, implicit $fpc
+; CHECK: vr64bit = contract nofpexcept WFMADB_CCPseudo %6:vr64bit, %7:vr64bit, %0:fp64bit, implicit-def dead $cc, implicit $fpc
%arrayidx1 = getelementptr inbounds double, ptr %src, i64 1
%arrayidx2 = getelementptr inbounds double, ptr %src, i64 2
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll
index 787f6c90c29e45..c45bc19f3d926d 100644
--- a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll
@@ -17,15 +17,15 @@ define double @fun0_fma2_add(ptr %x, double %A, double %B) {
; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
; CHECK-NEXT: %6:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
; CHECK-NEXT: %7:vr64bit = {{.*}} WFADB_CCPseudo [[X]], [[Y]]
-; CHECK-NEXT: %8:vr64bit = {{.*}} WFMADB killed [[M21:%3:vr64bit]], killed [[M22:%4:vr64bit]], killed %7:vr64bit
-; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB killed [[M31:%5:vr64bit]], killed [[M32:%6:vr64bit]], killed %8:vr64bit
+; CHECK-NEXT: %8:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21:%3:vr64bit]], killed [[M22:%4:vr64bit]], killed %7:vr64bit
+; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M31:%5:vr64bit]], killed [[M32:%6:vr64bit]], killed %8:vr64bit
; CHECK-NEXT: $f0d = COPY %9:vr64bit
; CHECK-NEXT: Return implicit $f0d
; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
; CHECK-NEXT: # Machine code for function fun0_fma2_add: IsSSA, TracksLiveness
-; CHECK: %10:vr64bit = {{.*}} WFMADB killed [[M21]], killed [[M22]], [[X]]
-; CHECK-NEXT: %11:vr64bit = {{.*}} WFMADB killed [[M31]], killed [[M32]], [[Y]]
+; CHECK: %10:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], [[X]]
+; CHECK-NEXT: %11:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M31]], killed [[M32]], [[Y]]
; CHECK-NEXT: %9:vr64bit = {{.*}} WFADB_CCPseudo %10:vr64bit, %11:vr64bit
; CHECK-NEXT: $f0d = COPY %9:vr64bit
; CHECK-NEXT: Return implicit $f0d
@@ -65,8 +65,8 @@ define double @fun1_fma2_add_divop(ptr %x, double %A, double %B) {
; CHECK-NEXT: %6:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
; CHECK-NEXT: %7:vr64bit = nofpexcept WFDDB %5:vr64bit, killed %6:vr64bit, implicit $fpc
; CHECK-NEXT: %8:vr64bit = {{.*}} WFADB_CCPseudo %1:fp64bit, %2:fp64bit
-; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB killed %3:vr64bit, killed %4:vr64bit, killed %8:vr64bit
-; CHECK-NEXT: %10:vr64bit = {{.*}} WFMADB %5:vr64bit, killed %7:vr64bit, killed %9:vr64bit
+; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB_CCPseudo killed %3:vr64bit, killed %4:vr64bit, killed %8:vr64bit
+; CHECK-NEXT: %10:vr64bit = {{.*}} WFMADB_CCPseudo %5:vr64bit, killed %7:vr64bit, killed %9:vr64bit
; CHECK-NEXT: $f0d = COPY %10:vr64bit
; CHECK-NEXT: Return implicit $f0d
entry:
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll
index 10a671e0c68060..fd6a41f2a717e4 100644
--- a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll
@@ -17,15 +17,15 @@ define double @fun0_fma2_divop(ptr %x) {
; CHECK-NEXT: [[M11:%3:vr64bit]] = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
; CHECK-NEXT: [[M12:%4:vr64bit]] = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
; CHECK-NEXT: [[DIV:%5:vr64bit]] = nofpexcept WFDDB %3:vr64bit, %4:vr64bit, implicit $fpc
-; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB killed [[M21]], killed [[M22]], killed [[DIV]]
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB [[M11]], [[M12]], killed %6:vr64bit
+; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], killed [[DIV]]
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo [[M11]], [[M12]], killed %6:vr64bit
; CHECK-NEXT: $f0d = COPY %7:vr64bit
; CHECK-NEXT: Return implicit $f0d
; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
; CHECK-NEXT: # Machine code for function fun0_fma2_divop: IsSSA, TracksLiveness
; CHECK: %8:vr64bit = {{.*}} WFMDB killed [[M21]], killed [[M22]]
-; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB [[M11]], [[M12]], %8:vr64bit
+; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB_CCPseudo [[M11]], [[M12]], %8:vr64bit
; CHECK-NEXT: %7:vr64bit = {{.*}} WFADB_CCPseudo killed [[DIV]], %9:vr64bit
entry:
%arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
@@ -59,8 +59,8 @@ define double @fun1_fma2(ptr %x, double %Arg) {
; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
-; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB killed %4:vr64bit, killed %5:vr64bit, killed %6:vr64bit
+; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo killed %4:vr64bit, killed %5:vr64bit, killed %6:vr64bit
; CHECK-NEXT: $f0d = COPY %7:vr64bit
; CHECK-NEXT: Return implicit $f0d
entry:
@@ -94,13 +94,13 @@ define double @fun2_fma2(ptr %x) {
; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
; CHECK-NEXT: [[DIV:%5:vr64bit]] = nofpexcept WFDDB %3:vr64bit, %4:vr64bit, implicit $fpc
-; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB killed %1:vr64bit, killed [[DIV]], killed %2:vr64bit
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB %3:vr64bit, %4:vr64bit, killed %6:vr64bit
+; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB_CCPseudo killed %1:vr64bit, killed [[DIV]], killed %2:vr64bit
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo %3:vr64bit, %4:vr64bit, killed %6:vr64bit
; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
; CHECK-NEXT: # Machine code for function fun2_fma2: IsSSA, TracksLiveness
-; CHECK: %12:vr64bit = {{.*}} WFMADB %3:vr64bit, %4:vr64bit, killed %2:vr64bit
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB killed %1:vr64bit, killed [[DIV]], %12:vr64bit
+; CHECK: %12:vr64bit = {{.*}} WFMADB_CCPseudo %3:vr64bit, %4:vr64bit, killed %2:vr64bit
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo killed %1:vr64bit, killed [[DIV]], %12:vr64bit
entry:
%arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll
index 5db80a465da5fe..1167dbfd06c702 100644
--- a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll
@@ -16,7 +16,7 @@ define double @fun0_fma1add(ptr %x) {
; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
; CHECK-NEXT: %5:vr64bit = {{.*}} WFADB_CCPseudo killed %3:vr64bit, killed %4:vr64bit
-; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB killed %1:vr64bit, killed %2:vr64bit, killed %5:vr64bit
+; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB_CCPseudo killed %1:vr64bit, killed %2:vr64bit, killed %5:vr64bit
; CHECK-NEXT: $f0d = COPY %6:vr64bit
; CHECK-NEXT: Return implicit $f0d
entry:
@@ -50,13 +50,13 @@ define double @fun1_fma1add_divop(ptr %x) {
; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
; CHECK-NEXT: [[DIV:%5:vr64bit]] = nofpexcept WFDDB [[T1]], killed %4:vr64bit, implicit $fpc
; CHECK-NEXT: %6:vr64bit = {{.*}} WFADB_CCPseudo [[T1]], killed [[DIV]]
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB killed [[M21]], killed [[M22]], killed %6:vr64bit
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], killed %6:vr64bit
; CHECK-NEXT: $f0d = COPY %7:vr64bit
; CHECK-NEXT: Return implicit $f0d
; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
; CHECK-NEXT: # Machine code for function fun1_fma1add_divop: IsSSA, TracksLiveness
-; CHECK: %8:vr64bit = {{.*}} WFMADB killed [[M21]], killed [[M22]], [[T1]]
+; CHECK: %8:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], [[T1]]
; CHECK-NEXT: %7:vr64bit = {{.*}} WFADB_CCPseudo %8:vr64bit, killed [[DIV]]
entry:
%arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
@@ -90,11 +90,11 @@ define double @fun2_fma1add_divop(ptr %x) {
; CHECK-NEXT: [[T2:%4:vr64bit]] = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
; CHECK-NEXT: [[DIV:%5:vr64bit]] = nofpexcept WFDDB killed %3:vr64bit, %4:vr64bit, implicit $fpc
; CHECK-NEXT: %6:vr64bit = {{.*}} WFADB_CCPseudo killed [[DIV]], [[T2]]
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB killed [[M21]], killed [[M22]], killed %6:vr64bit
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], killed %6:vr64bit
; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
; CHECK-NEXT: # Machine code for function fun2_fma1add_divop: IsSSA, TracksLiveness
-; CHECK: %9:vr64bit = {{.*}} WFMADB killed [[M21]], killed [[M22]], [[T2]]
+; CHECK: %9:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], [[T2]]
; CHECK: %7:vr64bit = {{.*}} WFADB_CCPseudo %9:vr64bit, killed [[DIV]]
entry:
%arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll
index 885047b67c621f..9a8fa90ef70b64 100644
--- a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll
@@ -33,14 +33,14 @@ define double @fun_fma8(ptr %x, double %A) {
; CHECK-NEXT: %15:vr64bit = VL64 %0:addr64bit, 104, $noreg :: (load (s64) from %ir.arrayidx24)
; CHECK-NEXT: %16:vr64bit = VL64 %0:addr64bit, 112, $noreg :: (load (s64) from %ir.arrayidx26)
; CHECK-NEXT: %17:vr64bit = VL64 %0:addr64bit, 120, $noreg :: (load (s64) from %ir.arrayidx28)
-; CHECK-NEXT: %18:vr64bit = {{.*}} WFMADB killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
-; CHECK-NEXT: %19:vr64bit = {{.*}} WFMADB killed %4:vr64bit, killed %5:vr64bit, killed %18:vr64bit
-; CHECK-NEXT: %20:vr64bit = {{.*}} WFMADB killed %6:vr64bit, killed %7:vr64bit, killed %19:vr64bit
-; CHECK-NEXT: %21:vr64bit = {{.*}} WFMADB killed %8:vr64bit, killed %9:vr64bit, killed %20:vr64bit
-; CHECK-NEXT: %22:vr64bit = {{.*}} WFMADB killed %10:vr64bit, killed %11:vr64bit, killed %21:vr64bit
-; CHECK-NEXT: %23:vr64bit = {{.*}} WFMADB killed %12:vr64bit, killed %13:vr64bit, killed %22:vr64bit
-; CHECK-NEXT: %24:vr64bit = {{.*}} WFMADB killed %14:vr64bit, killed %15:vr64bit, killed %23:vr64bit
-; CHECK-NEXT: %25:vr64bit = {{.*}} WFMADB killed %16:vr64bit, killed %17:vr64bit, killed %24:vr64bit
+; CHECK-NEXT: %18:vr64bit = {{.*}} WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
+; CHECK-NEXT: %19:vr64bit = {{.*}} WFMADB_CCPseudo killed %4:vr64bit, killed %5:vr64bit, killed %18:vr64bit
+; CHECK-NEXT: %20:vr64bit = {{.*}} WFMADB_CCPseudo killed %6:vr64bit, killed %7:vr64bit, killed %19:vr64bit
+; CHECK-NEXT: %21:vr64bit = {{.*}} WFMADB_CCPseudo killed %8:vr64bit, killed %9:vr64bit, killed %20:vr64bit
+; CHECK-NEXT: %22:vr64bit = {{.*}} WFMADB_CCPseudo killed %10:vr64bit, killed %11:vr64bit, killed %21:vr64bit
+; CHECK-NEXT: %23:vr64bit = {{.*}} WFMADB_CCPseudo killed %12:vr64bit, killed %13:vr64bit, killed %22:vr64bit
+; CHECK-NEXT: %24:vr64bit = {{.*}} WFMADB_CCPseudo killed %14:vr64bit, killed %15:vr64bit, killed %23:vr64bit
+; CHECK-NEXT: %25:vr64bit = {{.*}} WFMADB_CCPseudo killed %16:vr64bit, killed %17:vr64bit, killed %24:vr64bit
; CHECK-NEXT: $f0d = COPY %25:vr64bit
; CHECK-NEXT: Return implicit $f0d
@@ -49,7 +49,7 @@ define double @fun_fma8(ptr %x, double %A) {
; CHECK-NEXT: add pattern FMA2_P0P1
; CHECK-NEXT: add pattern FMA2
; CHECK: reassociating using pattern FMA_P1P0
-; CHECK: Dependence data for %21:vr64bit = {{.*}} WFMADB
+; CHECK: Dependence data for %21:vr64bit = {{.*}} WFMADB_CCPseudo
; CHECK-NEXT: NewRootDepth: 16 RootDepth: 22 It MustReduceDepth and it does it
; CHECK-NEXT: Resource length before replacement: 16 and after: 16
; CHECK-NEXT: As result it IMPROVES/PRESERVES Resource Length
@@ -57,67 +57,67 @@ define double @fun_fma8(ptr %x, double %A) {
; CHECK-NEXT: add pattern FMA2_P0P1
; CHECK-NEXT: add pattern FMA2
; CHECK-NEXT: reassociating using pattern FMA_P1P0
-; CHECK-NEXT: Dependence data for %23:vr64bit = {{.*}} WFMADB
+; CHECK-NEXT: Dependence data for %23:vr64bit = {{.*}} WFMADB_CCPseudo
; CHECK-NEXT: NewRootDepth: 22 RootDepth: 28 It MustReduceDepth and it does it
; CHECK: Resource length before replacement: 16 and after: 16
; CHECK-NEXT: As result it IMPROVES/PRESERVES Resource Length
; CHECK-NEXT: add pattern FMA1_Add_L
; CHECK-NEXT: add pattern FMA1_Add_R
; CHECK-NEXT: reassociating using pattern FMA1_Add_L
-; CHECK-NEXT: Dependence data for %24:vr64bit = {{.*}} WFMADB
+; CHECK-NEXT: Dependence data for %24:vr64bit = {{.*}} WFMADB_CCPseudo
; CHECK-NEXT: NewRootDepth: 28 RootDepth: 28 It MustReduceDepth but it does NOT do it
; CHECK-NEXT: reassociating using pattern FMA1_Add_R
-; CHECK-NEXT: Dependence data for %24:vr64bit = {{.*}} WFMADB
+; CHECK-NEXT: Dependence data for %24:vr64bit = {{.*}} WFMADB_CCPseudo
; CHECK-NEXT: NewRootDepth: 22 RootDepth: 28 It MustReduceDepth and it does it
; CHECK-NEXT: Resource length before replacement: 16 and after: 16
; CHECK-NEXT: As result it IMPROVES/PRESERVES Resource Length
; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
-; CHECK: %18:vr64bit = {{.*}} WFMADB killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
-; CHECK-NEXT: %19:vr64bit = {{.*}} WFMADB killed %4:vr64bit, killed %5:vr64bit, killed %18:vr64bit
+; CHECK: %18:vr64bit = {{.*}} WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
+; CHECK-NEXT: %19:vr64bit = {{.*}} WFMADB_CCPseudo killed %4:vr64bit, killed %5:vr64bit, killed %18:vr64bit
; CHECK-NEXT: %36:vr64bit = {{.*}} WFMDB killed %6:vr64bit, killed %7:vr64bit
-; CHECK-NEXT: %37:vr64bit = {{.*}} WFMADB killed %8:vr64bit, killed %9:vr64bit, %36:vr64bit
+; CHECK-NEXT: %37:vr64bit = {{.*}} WFMADB_CCPseudo killed %8:vr64bit, killed %9:vr64bit, %36:vr64bit
; CHECK-NEXT: %21:vr64bit = {{.*}} WFADB_CCPseudo killed %19:vr64bit, %37:vr64bit
; CHECK-NEXT: %40:vr64bit = {{.*}} WFMDB killed %10:vr64bit, killed %11:vr64bit
-; CHECK-NEXT: %41:vr64bit = {{.*}} WFMADB killed %12:vr64bit, killed %13:vr64bit, %40:vr64bit
-; CHECK-NEXT: %43:vr64bit = {{.*}} WFMADB killed %14:vr64bit, killed %15:vr64bit, %41:vr64bit
+; CHECK-NEXT: %41:vr64bit = {{.*}} WFMADB_CCPseudo killed %12:vr64bit, killed %13:vr64bit, %40:vr64bit
+; CHECK-NEXT: %43:vr64bit = {{.*}} WFMADB_CCPseudo killed %14:vr64bit, killed %15:vr64bit, %41:vr64bit
; CHECK-NEXT: %24:vr64bit = {{.*}} WFADB_CCPseudo %43:vr64bit, killed %21:vr64bit
-; CHECK-NEXT: %25:vr64bit = {{.*}} WFMADB killed %16:vr64bit, killed %17:vr64bit, killed %24:vr64bit
+; CHECK-NEXT: %25:vr64bit = {{.*}} WFMADB_CCPseudo killed %16:vr64bit, killed %17:vr64bit, killed %24:vr64bit
; ALT: Machine InstCombiner: fun_fma8
; ALT-NEXT: Combining MBB entry
; ALT-NEXT: add pattern FMA3
; ALT-NEXT: reassociating using pattern FMA3
-; ALT-NEXT: Dependence data for %20:vr64bit = {{.*}} WFMADB
+; ALT-NEXT: Dependence data for %20:vr64bit = {{.*}} WFMADB_CCPseudo
; ALT-NEXT: NewRootDepth: 16 RootDepth: 16 It MustReduceDepth but it does NOT do it
; ALT-NEXT: add pattern FMA3
; ALT-NEXT: reassociating using pattern FMA3
-; ALT-NEXT: Dependence data for %21:vr64bit = {{.*}} WFMADB
+; ALT-NEXT: Dependence data for %21:vr64bit = {{.*}} WFMADB_CCPseudo
; ALT-NEXT: NewRootDepth: 16 RootDepth: 22 It MustReduceDepth and it does it
; ALT-NEXT: Resource length before replacement: 16 and after: 16
; ALT-NEXT: As result it IMPROVES/PRESERVES Resource Length
; ALT-NEXT: add pattern FMA2_Add
; ALT-NEXT: reassociating using pattern FMA2_Add
-; ALT-NEXT: Dependence data for %23:vr64bit = {{.*}} WFMADB
+; ALT-NEXT: Dependence data for %23:vr64bit = {{.*}} WFMADB_CCPseudo
; ALT-NEXT: NewRootDepth: 22 RootDepth: 28 It MustReduceDepth and it does it
; ALT-NEXT: Resource length before replacement: 16 and after: 16
; ALT-NEXT: As result it IMPROVES/PRESERVES Resource Length
; ALT-NEXT: add pattern FMA2_Add
; ALT-NEXT: reassociating using pattern FMA2_Add
-; ALT-NEXT: Dependence data for %25:vr64bit = {{.*}} WFMADB
+; ALT-NEXT: Dependence data for %25:vr64bit = {{.*}} WFMADB_CCPseudo
; ALT-NEXT: NewRootDepth: 28 RootDepth: 34 It MustReduceDepth and it does it
; ALT-NEXT: Resource length before replacement: 16 and after: 16
; ALT-NEXT: As result it IMPROVES/PRESERVES Resource Length
; ALT: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
-; ALT: %18:vr64bit = {{.*}} WFMADB killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
+; ALT: %18:vr64bit = {{.*}} WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
; ALT-NEXT: %29:vr64bit = {{.*}} WFMDB killed %4:vr64bit, killed %5:vr64bit
-; ALT-NEXT: %30:vr64bit = {{.*}} WFMADB killed %6:vr64bit, killed %7:vr64bit, killed %18:vr64bit
-; ALT-NEXT: %31:vr64bit = {{.*}} WFMADB killed %8:vr64bit, killed %9:vr64bit, %29:vr64bit
-; ALT-NEXT: %32:vr64bit = {{.*}} WFMADB killed %10:vr64bit, killed %11:vr64bit, %30:vr64bit
-; ALT-NEXT: %33:vr64bit = {{.*}} WFMADB killed %12:vr64bit, killed %13:vr64bit, %31:vr64bit
-; ALT-NEXT: %34:vr64bit = {{.*}} WFMADB killed %14:vr64bit, killed %15:vr64bit, %32:vr64bit
-; ALT-NEXT: %35:vr64bit = {{.*}} WFMADB killed %16:vr64bit, killed %17:vr64bit, %33:vr64bit
+; ALT-NEXT: %30:vr64bit = {{.*}} WFMADB_CCPseudo killed %6:vr64bit, killed %7:vr64bit, killed %18:vr64bit
+; ALT-NEXT: %31:vr64bit = {{.*}} WFMADB_CCPseudo killed %8:vr64bit, killed %9:vr64bit, %29:vr64bit
+; ALT-NEXT: %32:vr64bit = {{.*}} WFMADB_CCPseudo killed %10:vr64bit, killed %11:vr64bit, %30:vr64bit
+; ALT-NEXT: %33:vr64bit = {{.*}} WFMADB_CCPseudo killed %12:vr64bit, killed %13:vr64bit, %31:vr64bit
+; ALT-NEXT: %34:vr64bit = {{.*}} WFMADB_CCPseudo killed %14:vr64bit, killed %15:vr64bit, %32:vr64bit
+; ALT-NEXT: %35:vr64bit = {{.*}} WFMADB_CCPseudo killed %16:vr64bit, killed %17:vr64bit, %33:vr64bit
; ALT-NEXT: %25:vr64bit = {{.*}} WFADB_CCPseudo %34:vr64bit, %35:vr64bit
entry:
>From 688aa0a72b88e1cff04929cbffd69132f3c626e3 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Tue, 2 Apr 2024 17:41:42 +0200
Subject: [PATCH 4/9] Rebase IP
---
llvm/lib/CodeGen/PeepholeOptimizer.cpp | 1 -
llvm/lib/Target/SystemZ/CMakeLists.txt | 2 +-
llvm/lib/Target/SystemZ/SystemZ.h | 4 +-
...m.cpp => SystemZFinalizeReassociation.cpp} | 54 ++-
.../Target/SystemZ/SystemZISelDAGToDAG.cpp | 15 +
.../Target/SystemZ/SystemZISelLowering.cpp | 4 -
llvm/lib/Target/SystemZ/SystemZInstrFP.td | 47 +-
.../lib/Target/SystemZ/SystemZInstrFormats.td | 20 +-
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 66 +--
llvm/lib/Target/SystemZ/SystemZInstrInfo.h | 1 -
llvm/lib/Target/SystemZ/SystemZInstrVector.td | 24 +-
llvm/lib/Target/SystemZ/SystemZOperators.td | 43 +-
.../Target/SystemZ/SystemZTargetMachine.cpp | 2 +-
.../CodeGen/SystemZ/fp-regmem-folding-01.ll | 251 ----------
.../CodeGen/SystemZ/fp-regmem-folding-02.ll | 164 -------
.../CodeGen/SystemZ/fp-regmem-folding-03.ll | 86 ----
.../CodeGen/SystemZ/fp-regmem-folding-04.ll | 62 ---
.../SystemZ/machine-combiner-reassoc-fp-01.ll | 443 ++++++++----------
.../SystemZ/machine-combiner-reassoc-fp-03.ll | 16 +-
.../SystemZ/machine-combiner-reassoc-fp-04.ll | 3 +-
.../SystemZ/machine-combiner-reassoc-fp-08.ll | 18 +-
.../SystemZ/machine-combiner-reassoc-fp-09.ll | 2 -
22 files changed, 362 insertions(+), 966 deletions(-)
rename llvm/lib/Target/SystemZ/{SystemZFinalizeRegMem.cpp => SystemZFinalizeReassociation.cpp} (52%)
delete mode 100644 llvm/test/CodeGen/SystemZ/fp-regmem-folding-01.ll
delete mode 100644 llvm/test/CodeGen/SystemZ/fp-regmem-folding-02.ll
delete mode 100644 llvm/test/CodeGen/SystemZ/fp-regmem-folding-03.ll
delete mode 100644 llvm/test/CodeGen/SystemZ/fp-regmem-folding-04.ll
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 5cd5962d4701e4..477a86dbe3f8c4 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -1868,7 +1868,6 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
// If we run into an instruction we can't fold across, discard
// the load candidates. Note: We might be able to fold *into* this
// instruction, so this needs to be after the folding logic.
- // TODO: Try AA for a store?
if (MI->isLoadFoldBarrier()) {
LLVM_DEBUG(dbgs() << "Encountered load fold barrier on " << *MI);
FoldAsLoadDefCandidates.clear();
diff --git a/llvm/lib/Target/SystemZ/CMakeLists.txt b/llvm/lib/Target/SystemZ/CMakeLists.txt
index 6ab5d50c576ceb..b8f07d1222722c 100644
--- a/llvm/lib/Target/SystemZ/CMakeLists.txt
+++ b/llvm/lib/Target/SystemZ/CMakeLists.txt
@@ -20,7 +20,7 @@ add_llvm_target(SystemZCodeGen
SystemZConstantPoolValue.cpp
SystemZCopyPhysRegs.cpp
SystemZElimCompare.cpp
- SystemZFinalizeRegMem.cpp
+ SystemZFinalizeReassociation.cpp
SystemZFrameLowering.cpp
SystemZHazardRecognizer.cpp
SystemZISelDAGToDAG.cpp
diff --git a/llvm/lib/Target/SystemZ/SystemZ.h b/llvm/lib/Target/SystemZ/SystemZ.h
index c357c5a4250144..49a200babfff57 100644
--- a/llvm/lib/Target/SystemZ/SystemZ.h
+++ b/llvm/lib/Target/SystemZ/SystemZ.h
@@ -195,14 +195,14 @@ FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZCopyPhysRegsPass(SystemZTargetMachine &TM);
-FunctionPass *createSystemZFinalizeRegMemPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZFinalizeReassociationPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZPostRewritePass(SystemZTargetMachine &TM);
FunctionPass *createSystemZTDCPass();
void initializeSystemZCopyPhysRegsPass(PassRegistry &);
void initializeSystemZDAGToDAGISelPass(PassRegistry &);
void initializeSystemZElimComparePass(PassRegistry &);
-void initializeSystemZFinalizeRegMemPass(PassRegistry &);
+void initializeSystemZFinalizeReassociationPass(PassRegistry &);
void initializeSystemZLDCleanupPass(PassRegistry &);
void initializeSystemZLongBranchPass(PassRegistry &);
void initializeSystemZPostRewritePass(PassRegistry &);
diff --git a/llvm/lib/Target/SystemZ/SystemZFinalizeRegMem.cpp b/llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp
similarity index 52%
rename from llvm/lib/Target/SystemZ/SystemZFinalizeRegMem.cpp
rename to llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp
index 68b2797b114e4b..2b5b66afa317b5 100644
--- a/llvm/lib/Target/SystemZ/SystemZFinalizeRegMem.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp
@@ -1,4 +1,4 @@
-//===------- SystemZFinalizeRegMem.cpp - Finalize FP reg/mem folding ------===//
+//===---- SystemZFinalizeReassociation.cpp - Finalize FP reassociation ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,32 +6,36 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass converts any remaining reg/reg pseudos into the real target
-// instruction in cases where the peephole optimizer did not fold a load into
-// a reg/mem instruction.
+// This pass is the last step of the process of enabling reassociation with
+// the MachineCombiner. These are the steps involved:
+//
+// 1. Instruction selection: Disable reg/mem folding for any operations that
+// are reassociable since MachineCombiner will not succeed otherwise.
+// Select a reg/reg pseudo that pretends to clobber CC since the reg/mem
+// opcode clobbers it.
+//
+// 2. MachineCombiner: Performs reassociation with the reg/reg instructions.
+//
+// 3. PeepholeOptimizer: Fold loads into reg/mem instructions.
+//
+// 4. This pass: Convert any remaining reg/reg pseudos.
//
//===----------------------------------------------------------------------===//
-#include "SystemZMachineFunctionInfo.h"
#include "SystemZTargetMachine.h"
-#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/Target/TargetMachine.h"
using namespace llvm;
namespace {
-class SystemZFinalizeRegMem : public MachineFunctionPass {
+class SystemZFinalizeReassociation : public MachineFunctionPass {
public:
static char ID;
- SystemZFinalizeRegMem()
- : MachineFunctionPass(ID), TII(nullptr), MRI(nullptr) {
- initializeSystemZFinalizeRegMemPass(*PassRegistry::getPassRegistry());
+ SystemZFinalizeReassociation()
+ : MachineFunctionPass(ID), TII(nullptr) {
+ initializeSystemZFinalizeReassociationPass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -42,27 +46,26 @@ class SystemZFinalizeRegMem : public MachineFunctionPass {
bool visitMBB(MachineBasicBlock &MBB);
const SystemZInstrInfo *TII;
- MachineRegisterInfo *MRI;
};
-char SystemZFinalizeRegMem::ID = 0;
+char SystemZFinalizeReassociation::ID = 0;
} // end anonymous namespace
-INITIALIZE_PASS(SystemZFinalizeRegMem, "systemz-finalize-regmem",
- "SystemZ Finalize RegMem", false, false)
+INITIALIZE_PASS(SystemZFinalizeReassociation, "systemz-finalize-reassoc",
+ "SystemZ Finalize Reassociation", false, false)
FunctionPass *llvm::
-createSystemZFinalizeRegMemPass(SystemZTargetMachine &TM) {
- return new SystemZFinalizeRegMem();
+createSystemZFinalizeReassociationPass(SystemZTargetMachine &TM) {
+ return new SystemZFinalizeReassociation();
}
-void SystemZFinalizeRegMem::getAnalysisUsage(AnalysisUsage &AU) const {
+void SystemZFinalizeReassociation::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
-bool SystemZFinalizeRegMem::visitMBB(MachineBasicBlock &MBB) {
+bool SystemZFinalizeReassociation::visitMBB(MachineBasicBlock &MBB) {
bool Changed = false;
for (MachineInstr &MI : MBB) {
unsigned PseudoOpcode = MI.getOpcode();
@@ -71,8 +74,8 @@ bool SystemZFinalizeRegMem::visitMBB(MachineBasicBlock &MBB) {
: PseudoOpcode == SystemZ::WFASB_CCPseudo ? SystemZ::WFASB
: PseudoOpcode == SystemZ::WFSDB_CCPseudo ? SystemZ::WFSDB
: PseudoOpcode == SystemZ::WFSSB_CCPseudo ? SystemZ::WFSSB
- : PseudoOpcode == SystemZ::WFMADB_CCPseudo ? SystemZ::WFMADB
- : PseudoOpcode == SystemZ::WFMASB_CCPseudo ? SystemZ::WFMASB
+ : PseudoOpcode == SystemZ::WFMADB_CCPseudo ? SystemZ::WFMADB
+ : PseudoOpcode == SystemZ::WFMASB_CCPseudo ? SystemZ::WFMASB
: 0;
if (TargetOpcode) {
MI.setDesc(TII->get(TargetOpcode));
@@ -84,9 +87,8 @@ bool SystemZFinalizeRegMem::visitMBB(MachineBasicBlock &MBB) {
return Changed;
}
-bool SystemZFinalizeRegMem::runOnMachineFunction(MachineFunction &F) {
+bool SystemZFinalizeReassociation::runOnMachineFunction(MachineFunction &F) {
TII = F.getSubtarget<SystemZSubtarget>().getInstrInfo();
- MRI = &F.getRegInfo();
bool Modified = false;
for (auto &MBB : F)
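(Illustrative sketch, not part of the patch: the function below and the pass-by-pass flow are assumptions for exposition.) The four steps described in the new header comment above target input like a serial chain of reassociable FP adds fed by loads:

define double @sum4(ptr %p) {
entry:
  %q1 = getelementptr inbounds double, ptr %p, i64 1
  %q2 = getelementptr inbounds double, ptr %p, i64 2
  %q3 = getelementptr inbounds double, ptr %p, i64 3
  %a = load double, ptr %p
  %b = load double, ptr %q1
  %c = load double, ptr %q2
  %d = load double, ptr %q3
  %s1 = fadd reassoc nsz double %a, %b
  %s2 = fadd reassoc nsz double %s1, %c
  %s3 = fadd reassoc nsz double %s2, %d
  ret double %s3
}

; Sketch of the expected flow: isel selects the fadds as WFADB_CCPseudo
; (reg/reg, no load folding), MachineCombiner rebalances the chain to
; (a+b)+(c+d) to reduce depth, the peephole optimizer then folds loads into
; ADB where profitable, and this pass converts any remaining WFADB_CCPseudo
; back to WFADB.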
diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index deaf3dcaeb92a4..02ee0b9e2920eb 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -350,6 +350,11 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
// Try to expand a boolean SELECT_CCMASK using an IPM sequence.
SDValue expandSelectBoolean(SDNode *Node);
+  // Return true if the flags of N and the subtarget allow for
+ // reassociation, in which case a reg/reg opcode is needed as input to the
+ // MachineCombiner.
+ bool shouldSelectForReassoc(SDNode *N) const;
+
public:
static char ID;
@@ -2044,6 +2049,16 @@ SDValue SystemZDAGToDAGISel::expandSelectBoolean(SDNode *Node) {
return Result;
}
+bool SystemZDAGToDAGISel::shouldSelectForReassoc(SDNode *N) const {
+ EVT VT = N->getValueType(0);
+ assert(VT.isFloatingPoint() && "Expected FP SDNode");
+ return N->getFlags().hasAllowReassociation() &&
+ N->getFlags().hasNoSignedZeros() &&
+ Subtarget->hasVector() &&
+ (VT != MVT::f32 || Subtarget->hasVectorEnhancements1()) &&
+ !N->isStrictFPOpcode();
+}
+
void SystemZDAGToDAGISel::PreprocessISelDAG() {
// If we have conditional immediate loads, we always prefer
// using those over an IPM sequence.
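As a hedged note on shouldSelectForReassoc() above: hasAllowReassociation() and hasNoSignedZeros() correspond to the reassoc and nsz fast-math flags in the IR, so on a subtarget with the vector facility (e.g. -mcpu=z16, as in the tests) the fadd below would be expected to pass the check and be selected as a reg/reg CC pseudo, leaving the load unfolded for the MachineCombiner. The function name is made up for illustration.

define double @reassoc_candidate(double %x, ptr %p) {
  %l = load double, ptr %p
  ; reassoc sets hasAllowReassociation(), nsz sets hasNoSignedZeros()
  %r = fadd reassoc nsz double %x, %l
  ret double %r
}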
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 2e4e368f3d6779..2da4431cf077eb 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -692,10 +692,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
}
- // Don't select reg/mem LDEB if WLDEB is available.
- if (Subtarget.hasVector())
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
-
// Floating-point truncation and stores need to be done separately.
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 7f9ae518f7aaf2..5968fe8d744a55 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -201,7 +201,6 @@ let Predicates = [FeatureNoVectorEnhancements1] in {
// Extend memory floating-point values to wider representations.
let Uses = [FPC], mayRaiseFPException = 1 in {
def LDEB : UnaryRXE<"ldeb", 0xED04, z_any_extloadf32, FP64, 4>;
- def LDEB : UnaryRXE<"ldeb", 0xED04, z_fpr_any_extloadf32, FP64, 4>;
def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>;
def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>;
}
@@ -363,8 +362,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def SQDBR : UnaryRRE<"sqdbr", 0xB315, any_fsqrt, FP64, FP64>;
def SQXBR : UnaryRRE<"sqxbr", 0xB316, any_fsqrt, FP128, FP128>;
- def SQEB : UnaryRXE<"sqeb", 0xED14, loadu<any_fsqrt, z_fprload>, FP32, 4>;
- def SQDB : UnaryRXE<"sqdb", 0xED15, loadu<any_fsqrt, z_fprload>, FP64, 8>;
+ def SQEB : UnaryRXE<"sqeb", 0xED14, loadu<any_fsqrt>, FP32, 4>;
+ def SQDB : UnaryRXE<"sqdb", 0xED15, loadu<any_fsqrt>, FP64, 8>;
}
// Round to an integer, with the second operand (modifier M3) specifying
@@ -431,10 +430,10 @@ let Uses = [FPC], mayRaiseFPException = 1,
def ADBR : BinaryRRE<"adbr", 0xB31A, any_fadd, FP64, FP64>;
def AXBR : BinaryRRE<"axbr", 0xB34A, any_fadd, FP128, FP128>;
}
- defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, z_load, 4>;
- defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, z_load, 8>;
- defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, z_fprload, 4>;
- defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, z_fprload, 8>;
+ defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, z_any_fadd_noreassoc, FP32,
+ z_load, 4>;
+ defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, z_any_fadd_noreassoc, FP64,
+ z_load, 8>;
}
// Subtraction.
@@ -444,10 +443,10 @@ let Uses = [FPC], mayRaiseFPException = 1,
def SDBR : BinaryRRE<"sdbr", 0xB31B, any_fsub, FP64, FP64>;
def SXBR : BinaryRRE<"sxbr", 0xB34B, any_fsub, FP128, FP128>;
- defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, any_fsub, FP32, z_load, 4>;
- defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, any_fsub, FP64, z_load, 8>;
- defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, any_fsub, FP32, z_fprload, 4>;
- defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, any_fsub, FP64, z_fprload, 8>;
+ defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, z_any_fsub_noreassoc, FP32,
+ z_load, 4>;
+ defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, z_any_fsub_noreassoc, FP64,
+ z_load, 8>;
}
// Multiplication.
@@ -457,10 +456,10 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MDBR : BinaryRRE<"mdbr", 0xB31C, any_fmul, FP64, FP64>;
def MXBR : BinaryRRE<"mxbr", 0xB34C, any_fmul, FP128, FP128>;
}
- defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, z_load, 4>;
- defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, any_fmul, FP64, z_load, 8>;
- defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, z_fprload, 4>;
- defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, any_fmul, FP64, z_fprload, 8>;
+ defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, z_any_fmul_noreassoc, FP32,
+ z_load, 4>;
+ defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, z_any_fmul_noreassoc, FP64,
+ z_load, 8>;
}
// f64 multiplication of two FP32 registers.
@@ -502,12 +501,10 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MAEBR : TernaryRRD<"maebr", 0xB30E, z_any_fma, FP32, FP32>;
def MADBR : TernaryRRD<"madbr", 0xB31E, z_any_fma, FP64, FP64>;
- defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, z_load, 4>;
- defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, z_load, 8>;
- defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32,
- z_fprload, 4>;
- defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64,
- z_fprload, 8>;
+ defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma_noreassoc, FP32,
+ FP32, z_load, 4>;
+ defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma_noreassoc, FP64,
+ FP64, z_load, 8>;
}
// Fused multiply-subtract.
@@ -517,10 +514,6 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, z_load, 4>;
defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, z_load, 8>;
- defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32,
- z_fprload, 4>;
- defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64,
- z_fprload, 8>;
}
// Division.
@@ -531,8 +524,6 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
defm DEB : BinaryRXEAndPseudo<"deb", 0xED0D, any_fdiv, FP32, z_load, 4>;
defm DDB : BinaryRXEAndPseudo<"ddb", 0xED1D, any_fdiv, FP64, z_load, 8>;
- defm DEB : BinaryRXEAndPseudo<"deb", 0xED0D, any_fdiv, FP32, z_fprload, 4>;
- defm DDB : BinaryRXEAndPseudo<"ddb", 0xED1D, any_fdiv, FP64, z_fprload, 8>;
}
// Divide to integer.
@@ -552,8 +543,6 @@ let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC], CCValues = 0xF in {
def CEB : CompareRXE<"ceb", 0xED09, z_any_fcmp, FP32, z_load, 4>;
def CDB : CompareRXE<"cdb", 0xED19, z_any_fcmp, FP64, z_load, 8>;
- def CEB : CompareRXE<"ceb", 0xED09, z_any_fcmp, FP32, z_fprload, 4>;
- def CDB : CompareRXE<"cdb", 0xED19, z_any_fcmp, FP64, z_fprload, 8>;
def KEBR : CompareRRE<"kebr", 0xB308, z_strict_fcmps, FP32, FP32>;
def KDBR : CompareRRE<"kdbr", 0xB318, z_strict_fcmps, FP64, FP64>;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index 5a7be17c4dc1f6..a776d5d4dad490 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5539,29 +5539,31 @@ multiclass StringRRE<string mnemonic, bits<16> opcode,
multiclass BinaryVRRcAndCCPseudo<string mnemonic, bits<16> opcode,
SDPatternOperator operator,
+ SDPatternOperator reassoc_operator,
TypedReg tr1, TypedReg tr2, bits<4> type = 0,
bits<4> m5 = 0, bits<4> m6 = 0,
string fp_mnemonic = ""> {
- def "" : BinaryVRRc<mnemonic, opcode, null_frag, tr1, tr2, type, m5, m6,
+ def "" : BinaryVRRc<mnemonic, opcode, operator, tr1, tr2, type, m5, m6,
fp_mnemonic>;
- let Defs = [CC] in
+ let Defs = [CC], AddedComplexity = 1 in // Win over "".
def _CCPseudo : Pseudo<(outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3),
[(set (tr1.vt tr1.op:$V1),
- (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3)))]>;
+ (reassoc_operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3)))]>;
}
multiclass TernaryVRReAndCCPseudo<string mnemonic, bits<16> opcode,
SDPatternOperator operator,
+ SDPatternOperator reassoc_operator,
TypedReg tr1, TypedReg tr2, bits<4> m5 = 0,
bits<4> type = 0, string fp_mnemonic = ""> {
- def "" : TernaryVRRe<mnemonic, opcode, null_frag, tr1, tr2, m5, type,
+ def "" : TernaryVRRe<mnemonic, opcode, operator, tr1, tr2, m5, type,
fp_mnemonic>;
- let Defs = [CC] in
+ let Defs = [CC], AddedComplexity = 1 in // Win over "".
def _CCPseudo : Pseudo<(outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
[(set (tr1.vt tr1.op:$V1),
- (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- (tr1.vt tr1.op:$V4)))]>;
+ (reassoc_operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ (tr1.vt tr1.op:$V4)))]>;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index d3e5c3c5d07846..ff47b56447fbab 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -632,7 +632,6 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
- // TODO: Would it be beneficial to not fold in cases of high register pressure?
if (DISABLE_FOLDING)
return nullptr;
@@ -646,6 +645,9 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
!MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
return nullptr;
+ // For reassociable FP operations, any loads have been purposefully left
+ // unfolded so that MachineCombiner can do its work on reg/reg
+  // opcodes. After that, as many loads as possible are folded here.
unsigned LoadOpcD12 = 0;
unsigned LoadOpcD20 = 0;
unsigned RegMemOpcode = 0;
@@ -653,11 +655,7 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
RegMemOpcode = MI.getOpcode() == SystemZ::WFADB_CCPseudo ? SystemZ::ADB
: MI.getOpcode() == SystemZ::WFSDB_CCPseudo ? SystemZ::SDB
: MI.getOpcode() == SystemZ::WFMDB ? SystemZ::MDB
- : MI.getOpcode() == SystemZ::WFDDB ? SystemZ::DDB
: MI.getOpcode() == SystemZ::WFMADB_CCPseudo ? SystemZ::MADB
- : MI.getOpcode() == SystemZ::WFMSDB ? SystemZ::MSDB
- : MI.getOpcode() == SystemZ::WFSQDB ? SystemZ::SQDB
- : MI.getOpcode() == SystemZ::WFCDB ? SystemZ::CDB
: 0;
if (RegMemOpcode) {
LoadOpcD12 = SystemZ::VL64;
@@ -667,11 +665,7 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
RegMemOpcode = MI.getOpcode() == SystemZ::WFASB_CCPseudo ? SystemZ::AEB
: MI.getOpcode() == SystemZ::WFSSB_CCPseudo ? SystemZ::SEB
: MI.getOpcode() == SystemZ::WFMSB ? SystemZ::MEEB
- : MI.getOpcode() == SystemZ::WFDSB ? SystemZ::DEB
: MI.getOpcode() == SystemZ::WFMASB_CCPseudo ? SystemZ::MAEB
- : MI.getOpcode() == SystemZ::WFMSSB ? SystemZ::MSEB
- : MI.getOpcode() == SystemZ::WFSQSB ? SystemZ::SQEB
- : MI.getOpcode() == SystemZ::WFCSB ? SystemZ::CEB
: 0;
if (RegMemOpcode) {
LoadOpcD12 = SystemZ::VL32;
@@ -679,12 +673,6 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
FPRC = &SystemZ::FP32BitRegClass;
}
}
- if (MI.getOpcode() == SystemZ::WLDEB) {
- RegMemOpcode = SystemZ::LDEB;
- LoadOpcD12 = SystemZ::VL32;
- LoadOpcD20 = SystemZ::LEY;
- FPRC = &SystemZ::FP64BitRegClass;
- }
if (!RegMemOpcode ||
(DefMI->getOpcode() != LoadOpcD12 && DefMI->getOpcode() != LoadOpcD20))
@@ -695,39 +683,27 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
DebugLoc DL = MI.getDebugLoc();
Register DstReg = MI.getOperand(0).getReg();
-
- bool IsUnary = (RegMemOpcode == SystemZ::LDEB || RegMemOpcode == SystemZ::SQEB ||
- RegMemOpcode == SystemZ::SQDB);
+ MachineOperand LHS = MI.getOperand(1);
+ MachineOperand RHS = MI.getOperand(2);
bool IsTernary =
- (RegMemOpcode == SystemZ::MADB || RegMemOpcode == SystemZ::MAEB ||
- RegMemOpcode == SystemZ::MSDB || RegMemOpcode == SystemZ::MSEB);
- bool IsCmp = (RegMemOpcode == SystemZ::CEB ||RegMemOpcode == SystemZ::CDB);
- // (TODO: handle also strict FP compares?)
-
- MachineOperand LHS = MI.getOperand(1 - IsCmp);
- MachineOperand RHS = MI.getOperand(2 - IsCmp);
+ (RegMemOpcode == SystemZ::MADB || RegMemOpcode == SystemZ::MAEB);
MachineOperand &RegMO = RHS.getReg() == FoldAsLoadDefReg ? LHS : RHS;
MachineOperand *AccMO = IsTernary ? &MI.getOperand(3) : nullptr;
- if ((RegMemOpcode == SystemZ::SDB || RegMemOpcode == SystemZ::SEB ||
- RegMemOpcode == SystemZ::DDB || RegMemOpcode == SystemZ::DEB ||
- RegMemOpcode == SystemZ::CDB || RegMemOpcode == SystemZ::CEB) &&
+ if ((RegMemOpcode == SystemZ::SDB || RegMemOpcode == SystemZ::SEB) &&
FoldAsLoadDefReg != RHS.getReg())
return nullptr;
if (IsTernary && FoldAsLoadDefReg == AccMO->getReg())
return nullptr;
MachineInstrBuilder MIB =
- BuildMI(*MI.getParent(), MI, DL, get(RegMemOpcode));
- if (!IsCmp)
- MIB.addReg(DstReg, RegState::Define);
- if (!IsUnary) {
- if (IsTernary) {
- MIB.add(*AccMO);
- MRI->setRegClass(AccMO->getReg(), FPRC);
- }
- MIB.add(RegMO);
- MRI->setRegClass(RegMO.getReg(), FPRC);
+ BuildMI(*MI.getParent(), MI, DL, get(RegMemOpcode), DstReg);
+ MRI->setRegClass(DstReg, FPRC);
+ if (IsTernary) {
+ MIB.add(*AccMO);
+ MRI->setRegClass(AccMO->getReg(), FPRC);
}
+ MIB.add(RegMO);
+ MRI->setRegClass(RegMO.getReg(), FPRC);
MachineOperand &Base = DefMI->getOperand(1);
MachineOperand &Disp = DefMI->getOperand(2);
@@ -745,9 +721,7 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
}
MIB.addMemOperand(*DefMI->memoperands_begin());
transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);
- if (!IsCmp)
- MIB->addRegisterDead(SystemZ::CC, TRI);
- MRI->setRegClass(DstReg, FPRC);
+ MIB->addRegisterDead(SystemZ::CC, TRI);
return MIB;
}
@@ -1196,6 +1170,7 @@ bool SystemZInstrInfo::getFMAPatterns(
if (!AllOpsOK(Root))
return false;
+ // XXX Rewrite this for the patterns we want to actually use.
MachineInstr *TopAdd = nullptr;
std::vector<MachineInstr *> FMAChain;
FMAChain.push_back(&Root);
@@ -1268,6 +1243,8 @@ void SystemZInstrInfo::finalizeInsInstrs(
case SystemZ::WFASB_CCPseudo:
case SystemZ::WFSDB_CCPseudo:
case SystemZ::WFSSB_CCPseudo:
+ case SystemZ::WFMADB_CCPseudo:
+ case SystemZ::WFMASB_CCPseudo:
Inst->addRegisterDead(SystemZ::CC, TRI);
break;
default: break;
@@ -1372,7 +1349,7 @@ void SystemZInstrInfo::reassociateFMA(
const TargetRegisterClass *RC = Root.getRegClassConstraint(0, this, TRI);
Register DstReg = Root.getOperand(0).getReg();
- std::vector<MachineInstr *> Chain; // XXXXX
+ std::vector<MachineInstr *> Chain; // XXX Rework this method for final patterns used.
Chain.push_back(&Root);
uint16_t IntersectedFlags = Root.getFlags();
@@ -1426,7 +1403,7 @@ void SystemZInstrInfo::reassociateFMA(
LLVM_DEBUG(dbgs() << "reassociating using pattern FMA_P0P1\n");
Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
assert(IsAllFMA());
- getIntersectedFlags(); // XXXXXXXXXx
+ getIntersectedFlags(); // XXX Refactor (here and below)
Register NewVRA = createNewVReg(0);
Register NewVRB = createNewVReg(1);
unsigned FirstMulIdx =
@@ -1559,7 +1536,8 @@ bool
SystemZInstrInfo::accumulateInstrSeqToRootLatency(MachineInstr &Root) const {
// This doesn't make much sense for FMA patterns as they typically use an
// extra Add to do things in parallel.
- if (IsReassociableFMA(&Root)) // XXXXXXXXXXXX
+ if (IsReassociableFMA(&Root)) // XXX Fine tune this a bit depending on
+ // used patterns.
return false;
return true;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 6394a3ef925b5d..035a191491a7aa 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -319,7 +319,6 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;
bool accumulateInstrSeqToRootLatency(MachineInstr &Root) const override;
- int getExtendResourceLenLimit() const override { return 0; } //XXX
// SystemZ specific version of setSpecialOperandAttr that copies Flags to
// MI and clears nuw, nsw, and exact flags.
void setSpecialOperandAttr(MachineInstr &MI, uint32_t Flags) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 476a9b1dfed83b..696edf6ce09bf7 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -1063,12 +1063,12 @@ let Predicates = [FeatureVector] in {
let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
def VFADB : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>;
- defm WFADB : BinaryVRRcAndCCPseudo<"wfadb", 0xE7E3, any_fadd, v64db, v64db,
- 3, 8, 0, "adbr">;
+ defm WFADB : BinaryVRRcAndCCPseudo<"wfadb", 0xE7E3, any_fadd,
+ z_fadd_reassoc, v64db, v64db, 3, 8, 0, "adbr">;
let Predicates = [FeatureVectorEnhancements1] in {
def VFASB : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>;
- defm WFASB : BinaryVRRcAndCCPseudo<"wfasb", 0xE7E3, any_fadd, v32sb, v32sb,
- 2, 8, 0, "aebr">;
+ defm WFASB : BinaryVRRcAndCCPseudo<"wfasb", 0xE7E3, any_fadd,
+ z_fadd_reassoc, v32sb, v32sb, 2, 8, 0, "aebr">;
def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, any_fadd, v128xb, v128xb, 4, 8>;
}
}
@@ -1293,12 +1293,12 @@ let Predicates = [FeatureVector] in {
let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>;
def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, any_fma, v128db, v128db, 0, 3>;
- defm WFMADB : TernaryVRReAndCCPseudo<"wfmadb", 0xE78F, any_fma, v64db, v64db,
- 8, 3, "madbr">;
+ defm WFMADB : TernaryVRReAndCCPseudo<"wfmadb", 0xE78F, any_fma,
+ z_fma_reassoc, v64db, v64db, 8, 3, "madbr">;
let Predicates = [FeatureVectorEnhancements1] in {
def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, any_fma, v128sb, v128sb, 0, 2>;
- defm WFMASB : TernaryVRReAndCCPseudo<"wfmasb", 0xE78F, any_fma, v32sb, v32sb,
- 8, 2, "maebr">;
+ defm WFMASB : TernaryVRReAndCCPseudo<"wfmasb", 0xE78F, any_fma,
+ z_fma_reassoc, v32sb, v32sb, 8, 2, "maebr">;
def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, any_fma, v128xb, v128xb, 8, 4>;
}
}
@@ -1394,12 +1394,12 @@ let Predicates = [FeatureVector] in {
let Uses = [FPC], mayRaiseFPException = 1 in {
def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, any_fsub, v128db, v128db, 3, 0>;
- defm WFSDB : BinaryVRRcAndCCPseudo<"wfsdb", 0xE7E2, any_fsub, v64db, v64db,
- 3, 8, 0, "sdbr">;
+ defm WFSDB : BinaryVRRcAndCCPseudo<"wfsdb", 0xE7E2, any_fsub,
+ z_fsub_reassoc, v64db, v64db, 3, 8, 0, "sdbr">;
let Predicates = [FeatureVectorEnhancements1] in {
def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, any_fsub, v128sb, v128sb, 2, 0>;
- defm WFSSB : BinaryVRRcAndCCPseudo<"wfssb", 0xE7E2, any_fsub, v32sb, v32sb,
- 2, 8, 0, "sebr">;
+ defm WFSSB : BinaryVRRcAndCCPseudo<"wfssb", 0xE7E2, any_fsub,
+ z_fsub_reassoc, v32sb, v32sb, 2, 8, 0, "sebr">;
def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, any_fsub, v128xb, v128xb, 4, 8>;
}
}
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index 4a70a72a484232..701e3b580a92d6 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -744,19 +744,6 @@ defm block_and : block_op<and>;
defm block_or : block_op<or>;
defm block_xor : block_op<xor>;
-// A load (into FPR) selected only if the vector facility (/f32 enhancement)
-// is not present.
-def z_fprload : PatFrag<(ops node:$ptr), (load node:$ptr),
- [{ EVT MemVT = cast<LoadSDNode>(N)->getMemoryVT();
- EVT LoadVT = N->getValueType(0);
- assert(MemVT == LoadVT && "Unexpected load.");
- if (MemVT == MVT::f32)
- return !Subtarget->hasVectorEnhancements1();
- return !Subtarget->hasVector();
- }]>;
-def z_fpr_any_extloadf32 : PatFrag<(ops node:$ptr), (any_extloadf32 node:$ptr),
- [{ return !Subtarget->hasVector(); }]>;
-
// Insertions.
def inserti8 : PatFrag<(ops node:$src1, node:$src2),
(or (and node:$src1, -256), node:$src2)>;
@@ -842,6 +829,36 @@ def any_fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
// Floating-point negative absolute.
def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
+// Floating-point operations which will not participate in reassociation, and
+// therefore candidates for reg/mem folding during isel.
+def z_any_fadd_noreassoc : PatFrag<(ops node:$src1, node:$src2),
+ (any_fadd node:$src1, node:$src2),
+ [{ return !shouldSelectForReassoc(N); }]>;
+def z_any_fsub_noreassoc : PatFrag<(ops node:$src1, node:$src2),
+ (any_fsub node:$src1, node:$src2),
+ [{ return !shouldSelectForReassoc(N); }]>;
+def z_any_fmul_noreassoc : PatFrag<(ops node:$src1, node:$src2),
+ (any_fmul node:$src1, node:$src2),
+ [{ return !shouldSelectForReassoc(N); }]>;
+def z_any_fma_noreassoc : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (any_fma node:$src2, node:$src3, node:$src1),
+ [{ return !shouldSelectForReassoc(N); }]>;
+
+// Floating-point operations which are reassociable, and therefore should be
+// selected as reg/reg instructions (no memop folding).
+def z_fadd_reassoc : PatFrag<(ops node:$src1, node:$src2),
+ (fadd node:$src1, node:$src2),
+ [{ return shouldSelectForReassoc(N); }]>;
+def z_fsub_reassoc : PatFrag<(ops node:$src1, node:$src2),
+ (fsub node:$src1, node:$src2),
+ [{ return shouldSelectForReassoc(N); }]>;
+def z_fmul_reassoc : PatFrag<(ops node:$src1, node:$src2),
+ (fmul node:$src1, node:$src2),
+ [{ return shouldSelectForReassoc(N); }]>;
+def z_fma_reassoc : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (fma node:$src1, node:$src2, node:$src3),
+ [{ return shouldSelectForReassoc(N); }]>;
+
// Strict floating-point fragments.
def z_any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
[(z_strict_fcmp node:$lhs, node:$rhs),
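A small sketch of the split introduced by these fragments (illustrative, not taken from the patch's tests): the same operation selects differently depending on whether its fast-math flags satisfy shouldSelectForReassoc().

define double @plain(double %x, ptr %p) {
  %l = load double, ptr %p
  ; matches z_any_fadd_noreassoc, so the reg/mem form (ADB) stays a candidate
  %r = fadd double %x, %l
  ret double %r
}

define double @reassociable(double %x, ptr %p) {
  %l = load double, ptr %p
  ; matches z_fadd_reassoc and is selected as WFADB_CCPseudo (reg/reg)
  %r = fadd reassoc nsz double %x, %l
  ret double %r
}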
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 75b8f00e026d73..be13a84ecc3fe4 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -259,7 +259,7 @@ bool SystemZPassConfig::addILPOpts() {
void SystemZPassConfig::addPreRegAlloc() {
addPass(createSystemZCopyPhysRegsPass(getSystemZTargetMachine()));
- addPass(createSystemZFinalizeRegMemPass(getSystemZTargetMachine()));
+ addPass(createSystemZFinalizeReassociationPass(getSystemZTargetMachine()));
}
void SystemZPassConfig::addPostRewrite() {
diff --git a/llvm/test/CodeGen/SystemZ/fp-regmem-folding-01.ll b/llvm/test/CodeGen/SystemZ/fp-regmem-folding-01.ll
deleted file mode 100644
index 84726e889d3577..00000000000000
--- a/llvm/test/CodeGen/SystemZ/fp-regmem-folding-01.ll
+++ /dev/null
@@ -1,251 +0,0 @@
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -O3 -print-before=peephole-opt \
-; RUN: -print-after=peephole-opt,systemz-finalize-regmem -verify-machineinstrs 2>&1 \
-; RUN: | FileCheck %s
-
-define void @f0(float %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
-; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: [[LD1:%[0-9]+:vr32bit]] = VL32 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s32) from %ir.src1)
-; CHECK-NEXT: vr32bit = nofpexcept WFASB_CCPseudo %0:fp32bit, killed [[LD1]], implicit-def dead $cc, implicit $fpc
-; CHECK: [[LD2:%[0-9]+:vr32bit]] = VL32 %2:addr64bit, 0, $noreg :: (load (s32) from %ir.src2)
-; CHECK-NEXT: vr32bit = nofpexcept WFASB_CCPseudo %0:fp32bit, [[LD2]], implicit-def dead $cc, implicit $fpc
-; CHECK-NEXT: VST32 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp32bit = nofpexcept AEB %0:fp32bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s32) from %ir.src1)
-; CHECK: %8:vr32bit = nofpexcept WFASB_CCPseudo %0:fp32bit, [[LD2]], implicit-def dead $cc, implicit $fpc
-
-; CHECK: # *** IR Dump After SystemZ Finalize RegMem (systemz-finalize-regmem) ***:
-; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp32bit = nofpexcept AEB %0:fp32bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s32) from %ir.src1)
-; CHECK: %8:vr32bit = nofpexcept WFASB %0:fp32bit, [[LD2]], implicit $fpc
-
- %l1 = load float, ptr %src1
- %res1 = fadd float %a1, %l1
- store volatile float %res1, ptr %dst
-
- %l2 = load float, ptr %src2
- %res2 = fadd float %a1, %l2
- store volatile float %l2, ptr %dst
- store volatile float %res2, ptr %dst
-
- ret void
-}
-
-define void @f1(double %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
-; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: [[LD1:%[0-9]+:vr64bit]] = VL64 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s64) from %ir.src1)
-; CHECK-NEXT: vr64bit = nofpexcept WFADB_CCPseudo %0:fp64bit, killed [[LD1]], implicit-def dead $cc, implicit $fpc
-; CHECK: [[LD2:%[0-9]+:vr64bit]] = VL64 %2:addr64bit, 0, $noreg :: (load (s64) from %ir.src2)
-; CHECK-NEXT: vr64bit = nofpexcept WFADB_CCPseudo %0:fp64bit, [[LD2]], implicit-def dead $cc, implicit $fpc
-; CHECK-NEXT: VST64 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp64bit = nofpexcept ADB %0:fp64bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s64) from %ir.src1)
-; CHECK: %8:vr64bit = nofpexcept WFADB_CCPseudo %0:fp64bit, [[LD2]], implicit-def dead $cc, implicit $fpc
-
-; CHECK: # *** IR Dump After SystemZ Finalize RegMem (systemz-finalize-regmem) ***:
-; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp64bit = nofpexcept ADB %0:fp64bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s64) from %ir.src1)
-; CHECK: %8:vr64bit = nofpexcept WFADB %0:fp64bit, [[LD2]], implicit $fpc
-
- %l1 = load double, ptr %src1
- %res1 = fadd double %a1, %l1
- store volatile double %res1, ptr %dst
-
- %l2 = load double, ptr %src2
- %res2 = fadd double %a1, %l2
- store volatile double %l2, ptr %dst
- store volatile double %res2, ptr %dst
-
- ret void
-}
-
-define void @f2(float %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
-; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f2: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: [[LD1:%[0-9]+:vr32bit]] = VL32 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s32) from %ir.src1)
-; CHECK-NEXT: vr32bit = nofpexcept WFSSB_CCPseudo %0:fp32bit, killed [[LD1]], implicit-def dead $cc, implicit $fpc
-; CHECK: [[LD2:%[0-9]+:vr32bit]] = VL32 %2:addr64bit, 0, $noreg :: (load (s32) from %ir.src2)
-; CHECK-NEXT: vr32bit = nofpexcept WFSSB_CCPseudo %0:fp32bit, [[LD2]], implicit-def dead $cc, implicit $fpc
-; CHECK-NEXT: VST32 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f2: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp32bit = nofpexcept SEB %0:fp32bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s32) from %ir.src1)
-; CHECK: %8:vr32bit = nofpexcept WFSSB_CCPseudo %0:fp32bit, [[LD2]], implicit-def dead $cc, implicit $fpc
-
-; CHECK: # *** IR Dump After SystemZ Finalize RegMem (systemz-finalize-regmem) ***:
-; CHECK-NEXT: # Machine code for function f2: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp32bit = nofpexcept SEB %0:fp32bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s32) from %ir.src1)
-; CHECK: %8:vr32bit = nofpexcept WFSSB %0:fp32bit, [[LD2]], implicit $fpc
-
- %l1 = load float, ptr %src1
- %res1 = fsub float %a1, %l1
- store volatile float %res1, ptr %dst
-
- %l2 = load float, ptr %src2
- %res2 = fsub float %a1, %l2
- store volatile float %l2, ptr %dst
- store volatile float %res2, ptr %dst
-
- ret void
-}
-
-define void @f3(double %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
-; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f3: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: [[LD1:%[0-9]+:vr64bit]] = VL64 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s64) from %ir.src1)
-; CHECK-NEXT: vr64bit = nofpexcept WFSDB_CCPseudo %0:fp64bit, killed [[LD1]], implicit-def dead $cc, implicit $fpc
-; CHECK: [[LD2:%[0-9]+:vr64bit]] = VL64 %2:addr64bit, 0, $noreg :: (load (s64) from %ir.src2)
-; CHECK-NEXT: vr64bit = nofpexcept WFSDB_CCPseudo %0:fp64bit, [[LD2]], implicit-def dead $cc, implicit $fpc
-; CHECK-NEXT: VST64 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f3: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp64bit = nofpexcept SDB %0:fp64bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s64) from %ir.src1)
-; CHECK: %8:vr64bit = nofpexcept WFSDB_CCPseudo %0:fp64bit, [[LD2]], implicit-def dead $cc, implicit $fpc
-
-; CHECK: # *** IR Dump After SystemZ Finalize RegMem (systemz-finalize-regmem) ***:
-; CHECK-NEXT: # Machine code for function f3: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp64bit = nofpexcept SDB %0:fp64bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit-def dead $cc, implicit $fpc :: (load (s64) from %ir.src1)
-; CHECK: %8:vr64bit = nofpexcept WFSDB %0:fp64bit, [[LD2]], implicit $fpc
-
- %l1 = load double, ptr %src1
- %res1 = fsub double %a1, %l1
- store volatile double %res1, ptr %dst
-
- %l2 = load double, ptr %src2
- %res2 = fsub double %a1, %l2
- store volatile double %l2, ptr %dst
- store volatile double %res2, ptr %dst
-
- ret void
-}
-
-define void @f4(float %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
-; CHECK-LABEL: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f4: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: [[LD1:%[0-9]+:vr32bit]] = VL32 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s32) from %ir.src1)
-; CHECK-NEXT: vr32bit = nofpexcept WFMSB %0:fp32bit, killed [[LD1]], implicit $fpc
-; CHECK: [[LD2:%[0-9]+:vr32bit]] = VL32 %2:addr64bit, 0, $noreg :: (load (s32) from %ir.src2)
-; CHECK-NEXT: vr32bit = nofpexcept WFMSB %0:fp32bit, [[LD2]], implicit $fpc
-; CHECK-NEXT: VST32 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f4: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp32bit = nofpexcept MEEB %0:fp32bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit $fpc :: (load (s32) from %ir.src1)
-; CHECK: %8:vr32bit = nofpexcept WFMSB %0:fp32bit, [[LD2]], implicit $fpc
-
- %l1 = load float, ptr %src1
- %res1 = fmul float %a1, %l1
- store volatile float %res1, ptr %dst
-
- %l2 = load float, ptr %src2
- %res2 = fmul float %a1, %l2
- store volatile float %l2, ptr %dst
- store volatile float %res2, ptr %dst
-
- ret void
-}
-
-define void @f5(double %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
-; CHECK-LABEL: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f5: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: [[LD1:%[0-9]+:vr64bit]] = VL64 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s64) from %ir.src1)
-; CHECK-NEXT: vr64bit = nofpexcept WFMDB %0:fp64bit, killed [[LD1]], implicit $fpc
-; CHECK: [[LD2:%[0-9]+:vr64bit]] = VL64 %2:addr64bit, 0, $noreg :: (load (s64) from %ir.src2)
-; CHECK-NEXT: vr64bit = nofpexcept WFMDB %0:fp64bit, [[LD2]], implicit $fpc
-; CHECK-NEXT: VST64 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f5: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp64bit = nofpexcept MDB %0:fp64bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit $fpc :: (load (s64) from %ir.src1)
-; CHECK: %8:vr64bit = nofpexcept WFMDB %0:fp64bit, [[LD2]], implicit $fpc
-
- %l1 = load double, ptr %src1
- %res1 = fmul double %a1, %l1
- store volatile double %res1, ptr %dst
-
- %l2 = load double, ptr %src2
- %res2 = fmul double %a1, %l2
- store volatile double %l2, ptr %dst
- store volatile double %res2, ptr %dst
-
- ret void
-}
-
-define void @f6(float %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
-; CHECK-LABEL: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f6: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: [[LD1:%[0-9]+:vr32bit]] = VL32 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s32) from %ir.src1)
-; CHECK-NEXT: vr32bit = nofpexcept WFDSB %0:fp32bit, killed [[LD1]], implicit $fpc
-; CHECK: [[LD2:%[0-9]+:vr32bit]] = VL32 %2:addr64bit, 0, $noreg :: (load (s32) from %ir.src2)
-; CHECK-NEXT: vr32bit = nofpexcept WFDSB %0:fp32bit, [[LD2]], implicit $fpc
-; CHECK-NEXT: VST32 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f6: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp32bit = nofpexcept DEB %0:fp32bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit $fpc :: (load (s32) from %ir.src1)
-; CHECK: %8:vr32bit = nofpexcept WFDSB %0:fp32bit, [[LD2]], implicit $fpc
-
- %l1 = load float, ptr %src1
- %res1 = fdiv float %a1, %l1
- store volatile float %res1, ptr %dst
-
- %l2 = load float, ptr %src2
- %res2 = fdiv float %a1, %l2
- store volatile float %l2, ptr %dst
- store volatile float %res2, ptr %dst
-
- ret void
-}
-
-define void @f7(double %a1, ptr %src1, ptr %src2, ptr %src3, ptr %dst) {
-; CHECK-LABEL: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f7: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: [[LD1:%[0-9]+:vr64bit]] = VL64 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s64) from %ir.src1)
-; CHECK-NEXT: vr64bit = nofpexcept WFDDB %0:fp64bit, killed [[LD1]], implicit $fpc
-; CHECK: [[LD2:%[0-9]+:vr64bit]] = VL64 %2:addr64bit, 0, $noreg :: (load (s64) from %ir.src2)
-; CHECK-NEXT: vr64bit = nofpexcept WFDDB %0:fp64bit, [[LD2]], implicit $fpc
-; CHECK-NEXT: VST64 [[LD2]], %4:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f7: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp64bit = nofpexcept DDB %0:fp64bit(tied-def 0), [[ADDR1]], 0, $noreg, implicit $fpc :: (load (s64) from %ir.src1)
-; CHECK: %8:vr64bit = nofpexcept WFDDB %0:fp64bit, [[LD2]], implicit $fpc
-
- %l1 = load double, ptr %src1
- %res1 = fdiv double %a1, %l1
- store volatile double %res1, ptr %dst
-
- %l2 = load double, ptr %src2
- %res2 = fdiv double %a1, %l2
- store volatile double %l2, ptr %dst
- store volatile double %res2, ptr %dst
-
- ret void
-}
diff --git a/llvm/test/CodeGen/SystemZ/fp-regmem-folding-02.ll b/llvm/test/CodeGen/SystemZ/fp-regmem-folding-02.ll
deleted file mode 100644
index 062bc1ba4042e0..00000000000000
--- a/llvm/test/CodeGen/SystemZ/fp-regmem-folding-02.ll
+++ /dev/null
@@ -1,164 +0,0 @@
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -O3 -print-before=peephole-opt \
-; RUN: -print-after=peephole-opt -verify-machineinstrs 2>&1 | FileCheck %s
-
-define void @f0(float %A, ptr %src, ptr %dst) {
-; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: %3:vr32bit = VL32 [[ADDR1:%[0-9]+:addr64bit]], 4, $noreg :: (load (s32) from %ir.arrayidx1)
-; CHECK-NEXT: %4:vr32bit = VL32 %1:addr64bit, 8, $noreg :: (load (s32) from %ir.arrayidx2)
-; CHECK-NEXT: vr32bit = contract nofpexcept WFMASB_CCPseudo killed %3:vr32bit, killed %4:vr32bit, %0:fp32bit, implicit-def dead $cc, implicit $fpc
-; CHECK: %6:vr32bit = VL32 %1:addr64bit, 12, $noreg :: (load (s32) from %ir.arrayidx3)
-; CHECK-NEXT: %7:vr32bit = VL32 %1:addr64bit, 16, $noreg :: (load (s32) from %ir.arrayidx4)
-; CHECK-NEXT: %8:vr32bit = contract nofpexcept WFMASB_CCPseudo %6:vr32bit, %7:vr32bit, %0:fp32bit, implicit-def dead $cc, implicit $fpc
-; CHECK-NEXT: VST32 killed %8:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
-; CHECK-NEXT: VST32 %6:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
-; CHECK-NEXT: VST32 %7:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp32bit = nofpexcept MAEB %0:fp32bit(tied-def 0), killed %4:fp32bit, [[ADDR1]], 4, $noreg, implicit $fpc :: (load (s32) from %ir.arrayidx1)
-; CHECK: vr32bit = contract nofpexcept WFMASB_CCPseudo %6:vr32bit, %7:vr32bit, %0:fp32bit, implicit-def dead $cc, implicit $fpc
-
- %arrayidx1 = getelementptr inbounds float, ptr %src, i64 1
- %arrayidx2 = getelementptr inbounds float, ptr %src, i64 2
- %L1l = load float, ptr %arrayidx1
- %L1r = load float, ptr %arrayidx2
- %M1 = fmul contract float %L1l, %L1r
- %A1 = fadd contract float %A, %M1
- store volatile float %A1, ptr %dst
-
- %arrayidx3 = getelementptr inbounds float, ptr %src, i64 3
- %arrayidx4 = getelementptr inbounds float, ptr %src, i64 4
- %L2l = load float, ptr %arrayidx3
- %L2r = load float, ptr %arrayidx4
- %M2 = fmul contract float %L2l, %L2r
- %A2 = fadd contract float %A, %M2
- store volatile float %A2, ptr %dst
- store volatile float %L2l, ptr %dst
- store volatile float %L2r, ptr %dst
-
- ret void
-}
-
-define void @f1(double %A, ptr %src, ptr %dst) {
-; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: %3:vr64bit = VL64 [[ADDR1:%[0-9]+:addr64bit]], 8, $noreg :: (load (s64) from %ir.arrayidx1)
-; CHECK-NEXT: %4:vr64bit = VL64 %1:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
-; CHECK-NEXT: vr64bit = contract nofpexcept WFMADB_CCPseudo killed %3:vr64bit, killed %4:vr64bit, %0:fp64bit, implicit-def dead $cc, implicit $fpc
-; CHECK: %6:vr64bit = VL64 %1:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx3)
-; CHECK-NEXT: %7:vr64bit = VL64 %1:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx4)
-; CHECK-NEXT: %8:vr64bit = contract nofpexcept WFMADB_CCPseudo %6:vr64bit, %7:vr64bit, %0:fp64bit, implicit-def dead $cc, implicit $fpc
-; CHECK-NEXT: VST64 killed %8:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
-; CHECK-NEXT: VST64 %6:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
-; CHECK-NEXT: VST64 %7:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp64bit = nofpexcept MADB %0:fp64bit(tied-def 0), killed %4:fp64bit, [[ADDR1]], 8, $noreg, implicit $fpc :: (load (s64) from %ir.arrayidx1)
-; CHECK: vr64bit = contract nofpexcept WFMADB_CCPseudo %6:vr64bit, %7:vr64bit, %0:fp64bit, implicit-def dead $cc, implicit $fpc
-
- %arrayidx1 = getelementptr inbounds double, ptr %src, i64 1
- %arrayidx2 = getelementptr inbounds double, ptr %src, i64 2
- %L1l = load double, ptr %arrayidx1
- %L1r = load double, ptr %arrayidx2
- %M1 = fmul contract double %L1l, %L1r
- %A1 = fadd contract double %A, %M1
- store volatile double %A1, ptr %dst
-
- %arrayidx3 = getelementptr inbounds double, ptr %src, i64 3
- %arrayidx4 = getelementptr inbounds double, ptr %src, i64 4
- %L2l = load double, ptr %arrayidx3
- %L2r = load double, ptr %arrayidx4
- %M2 = fmul contract double %L2l, %L2r
- %A2 = fadd contract double %A, %M2
- store volatile double %A2, ptr %dst
- store volatile double %L2l, ptr %dst
- store volatile double %L2r, ptr %dst
-
- ret void
-}
-
-define void @f2(float %A, ptr %src, ptr %dst) {
-; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f2: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: %3:vr32bit = VL32 [[ADDR1:%[0-9]+:addr64bit]], 4, $noreg :: (load (s32) from %ir.arrayidx1)
-; CHECK-NEXT: %4:vr32bit = VL32 %1:addr64bit, 8, $noreg :: (load (s32) from %ir.arrayidx2)
-; CHECK-NEXT: vr32bit = nofpexcept WFMSSB killed %3:vr32bit, killed %4:vr32bit, %0:fp32bit, implicit $fpc
-; CHECK: %6:vr32bit = VL32 %1:addr64bit, 12, $noreg :: (load (s32) from %ir.arrayidx3)
-; CHECK-NEXT: %7:vr32bit = VL32 %1:addr64bit, 16, $noreg :: (load (s32) from %ir.arrayidx4)
-; CHECK-NEXT: %8:vr32bit = nofpexcept WFMSSB %6:vr32bit, %7:vr32bit, %0:fp32bit, implicit $fpc
-; CHECK-NEXT: VST32 killed %8:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
-; CHECK-NEXT: VST32 %6:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
-; CHECK-NEXT: VST32 %7:vr32bit, %2:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f2: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp32bit = nofpexcept MSEB %0:fp32bit(tied-def 0), killed %4:fp32bit, [[ADDR1]], 4, $noreg, implicit $fpc :: (load (s32) from %ir.arrayidx1)
-; CHECK: vr32bit = nofpexcept WFMSSB %6:vr32bit, %7:vr32bit, %0:fp32bit, implicit $fpc
- %arrayidx1 = getelementptr inbounds float, ptr %src, i64 1
- %arrayidx2 = getelementptr inbounds float, ptr %src, i64 2
- %L1l = load float, ptr %arrayidx1
- %L1r = load float, ptr %arrayidx2
- %Negacc1 = fneg float %A
- %A1 = call float @llvm.fma.f32 (float %L1l, float %L1r, float %Negacc1)
- store volatile float %A1, ptr %dst
-
- %arrayidx3 = getelementptr inbounds float, ptr %src, i64 3
- %arrayidx4 = getelementptr inbounds float, ptr %src, i64 4
- %L2l = load float, ptr %arrayidx3
- %L2r = load float, ptr %arrayidx4
- %Negacc2 = fneg float %A
- %A2 = call float @llvm.fma.f32 (float %L2l, float %L2r, float %Negacc2)
- store volatile float %A2, ptr %dst
- store volatile float %L2l, ptr %dst
- store volatile float %L2r, ptr %dst
-
- ret void
-}
-
-define void @f3(double %A, ptr %src, ptr %dst) {
-; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f3: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: %3:vr64bit = VL64 [[ADDR1:%[0-9]+:addr64bit]], 8, $noreg :: (load (s64) from %ir.arrayidx1)
-; CHECK-NEXT: %4:vr64bit = VL64 %1:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
-; CHECK-NEXT: vr64bit = nofpexcept WFMSDB killed %3:vr64bit, killed %4:vr64bit, %0:fp64bit, implicit $fpc
-; CHECK: %6:vr64bit = VL64 %1:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx3)
-; CHECK-NEXT: %7:vr64bit = VL64 %1:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx4)
-; CHECK-NEXT: %8:vr64bit = nofpexcept WFMSDB %6:vr64bit, %7:vr64bit, %0:fp64bit, implicit $fpc
-; CHECK-NEXT: VST64 killed %8:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
-; CHECK-NEXT: VST64 %6:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
-; CHECK-NEXT: VST64 %7:vr64bit, %2:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f3: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: fp64bit = nofpexcept MSDB %0:fp64bit(tied-def 0), killed %4:fp64bit, [[ADDR1]], 8, $noreg, implicit $fpc :: (load (s64) from %ir.arrayidx1)
-; CHECK: vr64bit = nofpexcept WFMSDB %6:vr64bit, %7:vr64bit, %0:fp64bit, implicit $fpc
- %arrayidx1 = getelementptr inbounds double, ptr %src, i64 1
- %arrayidx2 = getelementptr inbounds double, ptr %src, i64 2
- %L1l = load double, ptr %arrayidx1
- %L1r = load double, ptr %arrayidx2
- %Negacc1 = fneg double %A
- %A1 = call double @llvm.fma.f64 (double %L1l, double %L1r, double %Negacc1)
- store volatile double %A1, ptr %dst
-
- %arrayidx3 = getelementptr inbounds double, ptr %src, i64 3
- %arrayidx4 = getelementptr inbounds double, ptr %src, i64 4
- %L2l = load double, ptr %arrayidx3
- %L2r = load double, ptr %arrayidx4
- %Negacc2 = fneg double %A
- %A2 = call double @llvm.fma.f64 (double %L2l, double %L2r, double %Negacc2)
- store volatile double %A2, ptr %dst
- store volatile double %L2l, ptr %dst
- store volatile double %L2r, ptr %dst
-
- ret void
-}
diff --git a/llvm/test/CodeGen/SystemZ/fp-regmem-folding-03.ll b/llvm/test/CodeGen/SystemZ/fp-regmem-folding-03.ll
deleted file mode 100644
index 5de6bb4d6af5c5..00000000000000
--- a/llvm/test/CodeGen/SystemZ/fp-regmem-folding-03.ll
+++ /dev/null
@@ -1,86 +0,0 @@
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -O3 -print-before=peephole-opt \
-; RUN: -print-after=peephole-opt -verify-machineinstrs 2>&1 | FileCheck %s
-
-define void @f0(ptr %src1, ptr %src2, ptr %dst) {
-; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: %3:vr32bit = VL32 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s32) from %ir.src1)
-; CHECK-NEXT: %4:vr64bit = nofpexcept WLDEB killed %3:vr32bit, implicit $fpc
-; CHECK: %5:vr32bit = VL32 %1:addr64bit, 0, $noreg :: (load (s32) from %ir.src2)
-; CHECK-NEXT: %6:vr64bit = nofpexcept WLDEB %5:vr32bit, implicit $fpc
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: %4:fp64bit = nofpexcept LDEB [[ADDR1]], 0, $noreg, implicit $fpc :: (load (s32) from %ir.src1)
-; CHECK: %5:vr32bit = VL32 %1:addr64bit, 0, $noreg :: (load (s32) from %ir.src2)
-; CHECK-NEXT: %6:vr64bit = nofpexcept WLDEB %5:vr32bit, implicit $fpc
-
- %L1 = load float, ptr %src1
- %D1 = fpext float %L1 to double
- store volatile double %D1, ptr %dst
-
- %L2 = load float, ptr %src2
- %D2 = fpext float %L2 to double
- store volatile double %D2, ptr %dst
- store volatile float %L2, ptr %dst
-
- ret void
-}
-
-define void @f1(ptr %ptr, ptr %dst) {
-; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: %2:vr32bit = VL32 [[ADDR2:%0:addr64bit]], 0, $noreg :: (load (s32) from %ir.ptr)
-; CHECK-NEXT: %3:vr32bit = nofpexcept WFSQSB killed %2:vr32bit, implicit $fpc
-; CHECK: %4:vr32bit = VL32 %0:addr64bit, 0, $noreg :: (load (s32) from %ir.ptr)
-; CHECK-NEXT: %5:vr32bit = nofpexcept WFSQSB %4:vr32bit, implicit $fpc
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: %3:fp32bit = nofpexcept SQEB [[ADDR2]], 0, $noreg, implicit $fpc :: (load (s32) from %ir.ptr)
-; CHECK: %4:vr32bit = VL32 %0:addr64bit, 0, $noreg :: (load (s32) from %ir.ptr)
-; CHECK-NEXT: %5:vr32bit = nofpexcept WFSQSB %4:vr32bit, implicit $fpc
-
- %L1 = load float, ptr %ptr
- %S1 = call float @llvm.sqrt.f32(float %L1)
- store volatile float %S1, ptr %dst
-
- %L2 = load float, ptr %ptr
- %S2 = call float @llvm.sqrt.f32(float %L2)
- store volatile float %S2, ptr %dst
- store volatile float %L2, ptr %dst
-
- ret void
-}
-
-define void @f2(ptr %ptr, ptr %dst) {
-; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f2: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: %2:vr64bit = VL64 [[ADDR2:%0:addr64bit]], 0, $noreg :: (load (s64) from %ir.ptr)
-; CHECK-NEXT: %3:vr64bit = nofpexcept WFSQDB killed %2:vr64bit, implicit $fpc
-; CHECK: %4:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.ptr)
-; CHECK-NEXT: %5:vr64bit = nofpexcept WFSQDB %4:vr64bit, implicit $fpc
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f2: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: %3:fp64bit = nofpexcept SQDB [[ADDR2]], 0, $noreg, implicit $fpc :: (load (s64) from %ir.ptr)
-; CHECK: %4:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.ptr)
-; CHECK-NEXT: %5:vr64bit = nofpexcept WFSQDB %4:vr64bit, implicit $fpc
-
- %L1 = load double, ptr %ptr
- %S1 = call double @llvm.sqrt.f64(double %L1)
- store volatile double %S1, ptr %dst
-
- %L2 = load double, ptr %ptr
- %S2 = call double @llvm.sqrt.f64(double %L2)
- store volatile double %S2, ptr %dst
- store volatile double %L2, ptr %dst
-
- ret void
-}
diff --git a/llvm/test/CodeGen/SystemZ/fp-regmem-folding-04.ll b/llvm/test/CodeGen/SystemZ/fp-regmem-folding-04.ll
deleted file mode 100644
index 58710a3be6489c..00000000000000
--- a/llvm/test/CodeGen/SystemZ/fp-regmem-folding-04.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -O3 -print-before=peephole-opt \
-; RUN: -print-after=peephole-opt -verify-machineinstrs 2>&1 | FileCheck %s
-
-define void @f0(i64 %a, i64 %b, float %f1, ptr %src1, ptr %src2, ptr %dst) {
-; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: %6:vr32bit = VL32 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s32) from %ir.src1)
-; CHECK-NEXT: nofpexcept WFCSB %2:fp32bit, killed %6:vr32bit, implicit-def $cc, implicit $fpc
-; CHECK: %9:vr32bit = VL32 %4:addr64bit, 0, $noreg :: (load (s32) from %ir.src2)
-; CHECK-NEXT: nofpexcept WFCSB %2:fp32bit, %9:vr32bit, implicit-def $cc, implicit $fpc
-; CHECK: VST32 %9:vr32bit, %5:addr64bit, 0, $noreg :: (volatile store (s32) into %ir.dst)
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f0: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: nofpexcept CEB %2:fp32bit, [[ADDR1]], 0, $noreg, implicit-def $cc, implicit $fpc :: (load (s32) from %ir.src1)
-; CHECK: nofpexcept WFCSB %2:fp32bit, %9:vr32bit, implicit-def $cc, implicit $fpc
-
- %L1 = load float, ptr %src1
- %C1 = fcmp oeq float %f1, %L1
- %S1 = select i1 %C1, i64 0, i64 1
- store volatile i64 %S1, ptr %dst
-
- %L2 = load float, ptr %src2
- %C2 = fcmp oeq float %f1, %L2
- %S2 = select i1 %C2, i64 0, i64 1
- store volatile i64 %S2, ptr %dst
- store volatile float %L2, ptr %dst
-
- ret void
-}
-
-define void @f1(i64 %a, i64 %b, double %f1, ptr %src1, ptr %src2, ptr %dst) {
-; CHECK: # *** IR Dump Before Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: %6:vr64bit = VL64 [[ADDR1:%[0-9]+:addr64bit]], 0, $noreg :: (load (s64) from %ir.src1)
-; CHECK-NEXT: nofpexcept WFCDB %2:fp64bit, killed %6:vr64bit, implicit-def $cc, implicit $fpc
-; CHECK: %9:vr64bit = VL64 %4:addr64bit, 0, $noreg :: (load (s64) from %ir.src2)
-; CHECK-NEXT: nofpexcept WFCDB %2:fp64bit, %9:vr64bit, implicit-def $cc, implicit $fpc
-; CHECK: VST64 %9:vr64bit, %5:addr64bit, 0, $noreg :: (volatile store (s64) into %ir.dst)
-
-; CHECK: # *** IR Dump After Peephole Optimizations (peephole-opt) ***:
-; CHECK-NEXT: # Machine code for function f1: IsSSA, TracksLiveness
-; CHECK-LABEL: bb.0 (%ir-block.0):
-; CHECK: nofpexcept CDB %2:fp64bit, [[ADDR1]], 0, $noreg, implicit-def $cc, implicit $fpc :: (load (s64) from %ir.src1)
-; CHECK: nofpexcept WFCDB %2:fp64bit, %9:vr64bit, implicit-def $cc, implicit $fpc
-
- %L1 = load double, ptr %src1
- %C1 = fcmp oeq double %f1, %L1
- %S1 = select i1 %C1, i64 0, i64 1
- store volatile i64 %S1, ptr %dst
-
- %L2 = load double, ptr %src2
- %C2 = fcmp oeq double %f1, %L2
- %S2 = select i1 %C2, i64 0, i64 1
- store volatile i64 %S2, ptr %dst
- store volatile double %L2, ptr %dst
-
- ret void
-}
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-01.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-01.ll
index 72303a47dc7386..fdf1be68a5430e 100644
--- a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-01.ll
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-01.ll
@@ -1,7 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
; RUN: | FileCheck %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -stop-before=processimpdefs \
-; RUN: -O3 | FileCheck %s --check-prefix=PASSOUTPUT
; Test reassociation of fp add, subtract and multiply.
@@ -19,12 +18,6 @@ define double @fun0_fadd(ptr %x) {
; CHECK-NEXT: adbr %f0, %f1
; CHECK-NEXT: adb %f0, 56(%r2)
; CHECK-NEXT: br %r14
-
-; PASSOUTPUT: name: fun0_fadd
-; PASSOUTPUT-NOT: WFADB
-; PASSOUTPUT: WFADB killed %3, killed %18, implicit $fpc
-; PASSOUTPUT-NOT: WFADB {{.*}}$cc
-; PASSOUTPUT-NOT: WFADB_CCPseudo
entry:
%0 = load double, ptr %x, align 8
%arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
@@ -54,23 +47,17 @@ entry:
define float @fun1_fadd(ptr %x) {
; CHECK-LABEL: fun1_fadd:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lde %f0, 0(%r2)
-; CHECK-NEXT: aeb %f0, 4(%r2)
-; CHECK-NEXT: lde %f1, 12(%r2)
-; CHECK-NEXT: aeb %f1, 8(%r2)
-; CHECK-NEXT: aebr %f0, %f1
-; CHECK-NEXT: lde %f1, 20(%r2)
-; CHECK-NEXT: aeb %f1, 16(%r2)
-; CHECK-NEXT: aeb %f1, 24(%r2)
-; CHECK-NEXT: aebr %f0, %f1
-; CHECK-NEXT: aeb %f0, 28(%r2)
-; CHECK-NEXT: br %r14
-
-; PASSOUTPUT: name: fun1_fadd
-; PASSOUTPUT-NOT: WFASB
-; PASSOUTPUT: WFASB killed %3, killed %18, implicit $fpc
-; PASSOUTPUT-NOT: WFASB {{.*}}$cc
-; PASSOUTPUT-NOT: WFASB_CCPseudo
+; CHECK-NEXT: lde %f0, 0(%r2)
+; CHECK-NEXT: aeb %f0, 4(%r2)
+; CHECK-NEXT: lde %f1, 12(%r2)
+; CHECK-NEXT: aeb %f1, 8(%r2)
+; CHECK-NEXT: aebr %f0, %f1
+; CHECK-NEXT: lde %f1, 20(%r2)
+; CHECK-NEXT: aeb %f1, 16(%r2)
+; CHECK-NEXT: aeb %f1, 24(%r2)
+; CHECK-NEXT: aebr %f0, %f1
+; CHECK-NEXT: aeb %f0, 28(%r2)
+; CHECK-NEXT: br %r14
entry:
%0 = load float, ptr %x, align 8
%arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
@@ -100,23 +87,23 @@ entry:
define fp128 @fun2_fadd(ptr %x) {
; CHECK-LABEL: fun2_fadd:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vl %v1, 16(%r3), 3
-; CHECK-NEXT: wfaxb %v0, %v1, %v0
-; CHECK-NEXT: vl %v1, 32(%r3), 3
-; CHECK-NEXT: vl %v2, 48(%r3), 3
-; CHECK-NEXT: wfaxb %v1, %v1, %v2
-; CHECK-NEXT: wfaxb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r3), 3
-; CHECK-NEXT: vl %v2, 80(%r3), 3
-; CHECK-NEXT: wfaxb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r3), 3
-; CHECK-NEXT: wfaxb %v1, %v1, %v2
-; CHECK-NEXT: wfaxb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 112(%r3), 3
-; CHECK-NEXT: wfaxb %v0, %v0, %v1
-; CHECK-NEXT: vst %v0, 0(%r2), 3
-; CHECK-NEXT: br %r14
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 16(%r3), 3
+; CHECK-NEXT: wfaxb %v0, %v1, %v0
+; CHECK-NEXT: vl %v1, 32(%r3), 3
+; CHECK-NEXT: vl %v2, 48(%r3), 3
+; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: wfaxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r3), 3
+; CHECK-NEXT: vl %v2, 80(%r3), 3
+; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r3), 3
+; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: wfaxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r3), 3
+; CHECK-NEXT: wfaxb %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
entry:
%0 = load fp128, ptr %x, align 8
%arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1
@@ -146,22 +133,22 @@ entry:
define <2 x double> @fun3_fadd(ptr %x) {
; CHECK-LABEL: fun3_fadd:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vl %v1, 16(%r2), 3
-; CHECK-NEXT: vfadb %v0, %v1, %v0
-; CHECK-NEXT: vl %v1, 32(%r2), 3
-; CHECK-NEXT: vl %v2, 48(%r2), 3
-; CHECK-NEXT: vfadb %v1, %v1, %v2
-; CHECK-NEXT: vfadb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r2), 3
-; CHECK-NEXT: vl %v2, 80(%r2), 3
-; CHECK-NEXT: vfadb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r2), 3
-; CHECK-NEXT: vfadb %v1, %v1, %v2
-; CHECK-NEXT: vfadb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 112(%r2), 3
-; CHECK-NEXT: vfadb %v24, %v0, %v1
-; CHECK-NEXT: br %r14
+; CHECK-NEXT: vl %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vfadb %v0, %v1, %v0
+; CHECK-NEXT: vl %v1, 32(%r2), 3
+; CHECK-NEXT: vl %v2, 48(%r2), 3
+; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vfadb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r2), 3
+; CHECK-NEXT: vl %v2, 80(%r2), 3
+; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r2), 3
+; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vfadb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r2), 3
+; CHECK-NEXT: vfadb %v24, %v0, %v1
+; CHECK-NEXT: br %r14
entry:
%0 = load <2 x double>, ptr %x, align 8
%arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1
@@ -191,22 +178,22 @@ entry:
define <4 x float> @fun4_fadd(ptr %x) {
; CHECK-LABEL: fun4_fadd:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vl %v1, 16(%r2), 3
-; CHECK-NEXT: vfasb %v0, %v1, %v0
-; CHECK-NEXT: vl %v1, 32(%r2), 3
-; CHECK-NEXT: vl %v2, 48(%r2), 3
-; CHECK-NEXT: vfasb %v1, %v1, %v2
-; CHECK-NEXT: vfasb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r2), 3
-; CHECK-NEXT: vl %v2, 80(%r2), 3
-; CHECK-NEXT: vfasb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r2), 3
-; CHECK-NEXT: vfasb %v1, %v1, %v2
-; CHECK-NEXT: vfasb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 112(%r2), 3
-; CHECK-NEXT: vfasb %v24, %v0, %v1
-; CHECK-NEXT: br %r14
+; CHECK-NEXT: vl %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vfasb %v0, %v1, %v0
+; CHECK-NEXT: vl %v1, 32(%r2), 3
+; CHECK-NEXT: vl %v2, 48(%r2), 3
+; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vfasb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r2), 3
+; CHECK-NEXT: vl %v2, 80(%r2), 3
+; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r2), 3
+; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vfasb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r2), 3
+; CHECK-NEXT: vfasb %v24, %v0, %v1
+; CHECK-NEXT: br %r14
entry:
%0 = load <4 x float>, ptr %x, align 8
%arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1
@@ -236,23 +223,17 @@ entry:
define double @fun5_fsub(ptr %x) {
; CHECK-LABEL: fun5_fsub:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: ld %f0, 0(%r2)
-; CHECK-NEXT: sdb %f0, 8(%r2)
-; CHECK-NEXT: ld %f1, 24(%r2)
-; CHECK-NEXT: adb %f1, 16(%r2)
-; CHECK-NEXT: sdbr %f0, %f1
-; CHECK-NEXT: ld %f1, 40(%r2)
-; CHECK-NEXT: adb %f1, 32(%r2)
-; CHECK-NEXT: adb %f1, 48(%r2)
-; CHECK-NEXT: sdbr %f0, %f1
-; CHECK-NEXT: sdb %f0, 56(%r2)
-; CHECK-NEXT: br %r14
-
-; PASSOUTPUT: name: fun5_fsub
-; PASSOUTPUT-NOT: WFSDB
-; PASSOUTPUT: WFSDB killed %3, killed %18, implicit $fpc
-; PASSOUTPUT-NOT: WFSDB {{.*}}$cc
-; PASSOUTPUT-NOT: WFSDB_CCPseudo
+; CHECK-NEXT: ld %f0, 0(%r2)
+; CHECK-NEXT: sdb %f0, 8(%r2)
+; CHECK-NEXT: ld %f1, 24(%r2)
+; CHECK-NEXT: adb %f1, 16(%r2)
+; CHECK-NEXT: sdbr %f0, %f1
+; CHECK-NEXT: ld %f1, 40(%r2)
+; CHECK-NEXT: adb %f1, 32(%r2)
+; CHECK-NEXT: adb %f1, 48(%r2)
+; CHECK-NEXT: sdbr %f0, %f1
+; CHECK-NEXT: sdb %f0, 56(%r2)
+; CHECK-NEXT: br %r14
entry:
%0 = load double, ptr %x, align 8
%arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
@@ -282,23 +263,17 @@ entry:
define float @fun6_fsub(ptr %x) {
; CHECK-LABEL: fun6_fsub:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lde %f0, 0(%r2)
-; CHECK-NEXT: seb %f0, 4(%r2)
-; CHECK-NEXT: lde %f1, 12(%r2)
-; CHECK-NEXT: aeb %f1, 8(%r2)
-; CHECK-NEXT: sebr %f0, %f1
-; CHECK-NEXT: lde %f1, 20(%r2)
-; CHECK-NEXT: aeb %f1, 16(%r2)
-; CHECK-NEXT: aeb %f1, 24(%r2)
-; CHECK-NEXT: sebr %f0, %f1
-; CHECK-NEXT: seb %f0, 28(%r2)
-; CHECK-NEXT: br %r14
-
-; PASSOUTPUT: name: fun6_fsub
-; PASSOUTPUT-NOT: WFSSB
-; PASSOUTPUT: WFSSB killed %3, killed %18, implicit $fpc
-; PASSOUTPUT-NOT: WFSSB {{.*}}$cc
-; PASSOUTPUT-NOT: WFSSB_CCPseudo
+; CHECK-NEXT: lde %f0, 0(%r2)
+; CHECK-NEXT: seb %f0, 4(%r2)
+; CHECK-NEXT: lde %f1, 12(%r2)
+; CHECK-NEXT: aeb %f1, 8(%r2)
+; CHECK-NEXT: sebr %f0, %f1
+; CHECK-NEXT: lde %f1, 20(%r2)
+; CHECK-NEXT: aeb %f1, 16(%r2)
+; CHECK-NEXT: aeb %f1, 24(%r2)
+; CHECK-NEXT: sebr %f0, %f1
+; CHECK-NEXT: seb %f0, 28(%r2)
+; CHECK-NEXT: br %r14
entry:
%0 = load float, ptr %x, align 8
%arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
@@ -328,23 +303,23 @@ entry:
define fp128 @fun7_fsub(ptr %x) {
; CHECK-LABEL: fun7_fsub:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vl %v1, 16(%r3), 3
-; CHECK-NEXT: wfsxb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 32(%r3), 3
-; CHECK-NEXT: vl %v2, 48(%r3), 3
-; CHECK-NEXT: wfaxb %v1, %v1, %v2
-; CHECK-NEXT: wfsxb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r3), 3
-; CHECK-NEXT: vl %v2, 80(%r3), 3
-; CHECK-NEXT: wfaxb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r3), 3
-; CHECK-NEXT: wfaxb %v1, %v1, %v2
-; CHECK-NEXT: wfsxb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 112(%r3), 3
-; CHECK-NEXT: wfsxb %v0, %v0, %v1
-; CHECK-NEXT: vst %v0, 0(%r2), 3
-; CHECK-NEXT: br %r14
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 16(%r3), 3
+; CHECK-NEXT: wfsxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 32(%r3), 3
+; CHECK-NEXT: vl %v2, 48(%r3), 3
+; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: wfsxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r3), 3
+; CHECK-NEXT: vl %v2, 80(%r3), 3
+; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r3), 3
+; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: wfsxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r3), 3
+; CHECK-NEXT: wfsxb %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
entry:
%0 = load fp128, ptr %x, align 8
%arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1
@@ -374,22 +349,22 @@ entry:
define <2 x double> @fun8_fsub(ptr %x) {
; CHECK-LABEL: fun8_fsub:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vl %v1, 16(%r2), 3
-; CHECK-NEXT: vfsdb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 32(%r2), 3
-; CHECK-NEXT: vl %v2, 48(%r2), 3
-; CHECK-NEXT: vfadb %v1, %v1, %v2
-; CHECK-NEXT: vfsdb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r2), 3
-; CHECK-NEXT: vl %v2, 80(%r2), 3
-; CHECK-NEXT: vfadb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r2), 3
-; CHECK-NEXT: vfadb %v1, %v1, %v2
-; CHECK-NEXT: vfsdb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 112(%r2), 3
-; CHECK-NEXT: vfsdb %v24, %v0, %v1
-; CHECK-NEXT: br %r14
+; CHECK-NEXT: vl %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vfsdb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 32(%r2), 3
+; CHECK-NEXT: vl %v2, 48(%r2), 3
+; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vfsdb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r2), 3
+; CHECK-NEXT: vl %v2, 80(%r2), 3
+; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r2), 3
+; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vfsdb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r2), 3
+; CHECK-NEXT: vfsdb %v24, %v0, %v1
+; CHECK-NEXT: br %r14
entry:
%0 = load <2 x double>, ptr %x, align 8
%arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1
@@ -419,22 +394,22 @@ entry:
define <4 x float> @fun9_fsub(ptr %x) {
; CHECK-LABEL: fun9_fsub:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vl %v1, 16(%r2), 3
-; CHECK-NEXT: vfssb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 32(%r2), 3
-; CHECK-NEXT: vl %v2, 48(%r2), 3
-; CHECK-NEXT: vfasb %v1, %v1, %v2
-; CHECK-NEXT: vfssb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r2), 3
-; CHECK-NEXT: vl %v2, 80(%r2), 3
-; CHECK-NEXT: vfasb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r2), 3
-; CHECK-NEXT: vfasb %v1, %v1, %v2
-; CHECK-NEXT: vfssb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 112(%r2), 3
-; CHECK-NEXT: vfssb %v24, %v0, %v1
-; CHECK-NEXT: br %r14
+; CHECK-NEXT: vl %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vfssb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 32(%r2), 3
+; CHECK-NEXT: vl %v2, 48(%r2), 3
+; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vfssb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r2), 3
+; CHECK-NEXT: vl %v2, 80(%r2), 3
+; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r2), 3
+; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vfssb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r2), 3
+; CHECK-NEXT: vfssb %v24, %v0, %v1
+; CHECK-NEXT: br %r14
entry:
%0 = load <4 x float>, ptr %x, align 8
%arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1
@@ -464,23 +439,17 @@ entry:
define double @fun10_fmul(ptr %x) {
; CHECK-LABEL: fun10_fmul:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: ld %f0, 8(%r2)
-; CHECK-NEXT: mdb %f0, 0(%r2)
-; CHECK-NEXT: ld %f1, 24(%r2)
-; CHECK-NEXT: mdb %f1, 16(%r2)
-; CHECK-NEXT: mdbr %f0, %f1
-; CHECK-NEXT: ld %f1, 40(%r2)
-; CHECK-NEXT: mdb %f1, 32(%r2)
-; CHECK-NEXT: mdb %f1, 48(%r2)
-; CHECK-NEXT: mdbr %f0, %f1
-; CHECK-NEXT: mdb %f0, 56(%r2)
-; CHECK-NEXT: br %r14
-
-; PASSOUTPUT: name: fun10_fmul
-; PASSOUTPUT-NOT: WFMDB
-; PASSOUTPUT: WFMDB killed %3, killed %18, implicit $fpc
-; PASSOUTPUT-NOT: WFMDB {{.*}}$cc
-; PASSOUTPUT-NOT: WFMDB_CCPseudo
+; CHECK-NEXT: ld %f0, 8(%r2)
+; CHECK-NEXT: mdb %f0, 0(%r2)
+; CHECK-NEXT: ld %f1, 24(%r2)
+; CHECK-NEXT: mdb %f1, 16(%r2)
+; CHECK-NEXT: mdbr %f0, %f1
+; CHECK-NEXT: ld %f1, 40(%r2)
+; CHECK-NEXT: mdb %f1, 32(%r2)
+; CHECK-NEXT: mdb %f1, 48(%r2)
+; CHECK-NEXT: mdbr %f0, %f1
+; CHECK-NEXT: mdb %f0, 56(%r2)
+; CHECK-NEXT: br %r14
entry:
%0 = load double, ptr %x, align 8
%arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
@@ -510,23 +479,17 @@ entry:
define float @fun11_fmul(ptr %x) {
; CHECK-LABEL: fun11_fmul:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lde %f0, 4(%r2)
-; CHECK-NEXT: meeb %f0, 0(%r2)
-; CHECK-NEXT: lde %f1, 12(%r2)
-; CHECK-NEXT: meeb %f1, 8(%r2)
-; CHECK-NEXT: meebr %f0, %f1
-; CHECK-NEXT: lde %f1, 20(%r2)
-; CHECK-NEXT: meeb %f1, 16(%r2)
-; CHECK-NEXT: meeb %f1, 24(%r2)
-; CHECK-NEXT: meebr %f0, %f1
-; CHECK-NEXT: meeb %f0, 28(%r2)
-; CHECK-NEXT: br %r14
-
-; PASSOUTPUT: name: fun11_fmul
-; PASSOUTPUT-NOT: WFMSB
-; PASSOUTPUT: WFMSB killed %3, killed %18, implicit $fpc
-; PASSOUTPUT-NOT: WFMSB {{.*}}$cc
-; PASSOUTPUT-NOT: WFMSB_CCPseudo
+; CHECK-NEXT: lde %f0, 4(%r2)
+; CHECK-NEXT: meeb %f0, 0(%r2)
+; CHECK-NEXT: lde %f1, 12(%r2)
+; CHECK-NEXT: meeb %f1, 8(%r2)
+; CHECK-NEXT: meebr %f0, %f1
+; CHECK-NEXT: lde %f1, 20(%r2)
+; CHECK-NEXT: meeb %f1, 16(%r2)
+; CHECK-NEXT: meeb %f1, 24(%r2)
+; CHECK-NEXT: meebr %f0, %f1
+; CHECK-NEXT: meeb %f0, 28(%r2)
+; CHECK-NEXT: br %r14
entry:
%0 = load float, ptr %x, align 8
%arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
@@ -556,23 +519,23 @@ entry:
define fp128 @fun12_fmul(ptr %x) {
; CHECK-LABEL: fun12_fmul:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vl %v1, 16(%r3), 3
-; CHECK-NEXT: wfmxb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 32(%r3), 3
-; CHECK-NEXT: vl %v2, 48(%r3), 3
-; CHECK-NEXT: wfmxb %v1, %v1, %v2
-; CHECK-NEXT: wfmxb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r3), 3
-; CHECK-NEXT: vl %v2, 80(%r3), 3
-; CHECK-NEXT: wfmxb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r3), 3
-; CHECK-NEXT: wfmxb %v1, %v1, %v2
-; CHECK-NEXT: wfmxb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 112(%r3), 3
-; CHECK-NEXT: wfmxb %v0, %v0, %v1
-; CHECK-NEXT: vst %v0, 0(%r2), 3
-; CHECK-NEXT: br %r14
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 16(%r3), 3
+; CHECK-NEXT: wfmxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 32(%r3), 3
+; CHECK-NEXT: vl %v2, 48(%r3), 3
+; CHECK-NEXT: wfmxb %v1, %v1, %v2
+; CHECK-NEXT: wfmxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r3), 3
+; CHECK-NEXT: vl %v2, 80(%r3), 3
+; CHECK-NEXT: wfmxb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r3), 3
+; CHECK-NEXT: wfmxb %v1, %v1, %v2
+; CHECK-NEXT: wfmxb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r3), 3
+; CHECK-NEXT: wfmxb %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
entry:
%0 = load fp128, ptr %x, align 8
%arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1
@@ -602,22 +565,22 @@ entry:
define <2 x double> @fun13_fmul(ptr %x) {
; CHECK-LABEL: fun13_fmul:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vl %v1, 16(%r2), 3
-; CHECK-NEXT: vfmdb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 32(%r2), 3
-; CHECK-NEXT: vl %v2, 48(%r2), 3
-; CHECK-NEXT: vfmdb %v1, %v1, %v2
-; CHECK-NEXT: vfmdb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r2), 3
-; CHECK-NEXT: vl %v2, 80(%r2), 3
-; CHECK-NEXT: vfmdb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r2), 3
-; CHECK-NEXT: vfmdb %v1, %v1, %v2
-; CHECK-NEXT: vfmdb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 112(%r2), 3
-; CHECK-NEXT: vfmdb %v24, %v0, %v1
-; CHECK-NEXT: br %r14
+; CHECK-NEXT: vl %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vfmdb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 32(%r2), 3
+; CHECK-NEXT: vl %v2, 48(%r2), 3
+; CHECK-NEXT: vfmdb %v1, %v1, %v2
+; CHECK-NEXT: vfmdb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r2), 3
+; CHECK-NEXT: vl %v2, 80(%r2), 3
+; CHECK-NEXT: vfmdb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r2), 3
+; CHECK-NEXT: vfmdb %v1, %v1, %v2
+; CHECK-NEXT: vfmdb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r2), 3
+; CHECK-NEXT: vfmdb %v24, %v0, %v1
+; CHECK-NEXT: br %r14
entry:
%0 = load <2 x double>, ptr %x, align 8
%arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1
@@ -647,22 +610,22 @@ entry:
define <4 x float> @fun14_fmul(ptr %x) {
; CHECK-LABEL: fun14_fmul:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vl %v0, 0(%r2), 3
-; CHECK-NEXT: vl %v1, 16(%r2), 3
-; CHECK-NEXT: vfmsb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 32(%r2), 3
-; CHECK-NEXT: vl %v2, 48(%r2), 3
-; CHECK-NEXT: vfmsb %v1, %v1, %v2
-; CHECK-NEXT: vfmsb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r2), 3
-; CHECK-NEXT: vl %v2, 80(%r2), 3
-; CHECK-NEXT: vfmsb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r2), 3
-; CHECK-NEXT: vfmsb %v1, %v1, %v2
-; CHECK-NEXT: vfmsb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 112(%r2), 3
-; CHECK-NEXT: vfmsb %v24, %v0, %v1
-; CHECK-NEXT: br %r14
+; CHECK-NEXT: vl %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vfmsb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 32(%r2), 3
+; CHECK-NEXT: vl %v2, 48(%r2), 3
+; CHECK-NEXT: vfmsb %v1, %v1, %v2
+; CHECK-NEXT: vfmsb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 64(%r2), 3
+; CHECK-NEXT: vl %v2, 80(%r2), 3
+; CHECK-NEXT: vfmsb %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 96(%r2), 3
+; CHECK-NEXT: vfmsb %v1, %v1, %v2
+; CHECK-NEXT: vfmsb %v0, %v0, %v1
+; CHECK-NEXT: vl %v1, 112(%r2), 3
+; CHECK-NEXT: vfmsb %v24, %v0, %v1
+; CHECK-NEXT: br %r14
entry:
%0 = load <4 x float>, ptr %x, align 8
%arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll
index c45bc19f3d926d..9511a5807d4c1d 100644
--- a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll
@@ -1,9 +1,10 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
; RUN: -print-before=machine-combiner -print-after=machine-combiner -ppc-fma \
; RUN: 2>&1 | FileCheck %s
-
; REQUIRES: asserts
+; Test reassociation involving fma using a ppc pattern.
+
define double @fun0_fma2_add(ptr %x, double %A, double %B) {
; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
; CHECK-NEXT: # Machine code for function fun0_fma2_add: IsSSA, TracksLiveness
@@ -61,13 +62,12 @@ define double @fun1_fma2_add_divop(ptr %x, double %A, double %B) {
; CHECK-NEXT: %0:addr64bit = COPY $r2d
; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
-; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
-; CHECK-NEXT: %6:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
-; CHECK-NEXT: %7:vr64bit = nofpexcept WFDDB %5:vr64bit, killed %6:vr64bit, implicit $fpc
-; CHECK-NEXT: %8:vr64bit = {{.*}} WFADB_CCPseudo %1:fp64bit, %2:fp64bit
-; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB_CCPseudo killed %3:vr64bit, killed %4:vr64bit, killed %8:vr64bit
-; CHECK-NEXT: %10:vr64bit = {{.*}} WFMADB_CCPseudo %5:vr64bit, killed %7:vr64bit, killed %9:vr64bit
-; CHECK-NEXT: $f0d = COPY %10:vr64bit
+; CHECK-NEXT: %5:fp64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %6:fp64bit = {{.*}} DDB %5:fp64bit(tied-def 0), %0:addr64bit, 24, $noreg
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFADB_CCPseudo %1:fp64bit, %2:fp64bit
+; CHECK-NEXT: %8:vr64bit = {{.*}} WFMADB_CCPseudo killed %3:vr64bit, killed %4:vr64bit, killed %7:vr64bit
+; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB_CCPseudo %5:fp64bit, killed %6:fp64bit, killed %8:vr64bit
+; CHECK-NEXT: $f0d = COPY %9:vr64bit
; CHECK-NEXT: Return implicit $f0d
entry:
%arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll
index fd6a41f2a717e4..04a1298144aa12 100644
--- a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll
@@ -1,9 +1,10 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
; RUN: -print-before=machine-combiner -print-after=machine-combiner -z-fma \
; RUN: 2>&1 | FileCheck %s
-
; REQUIRES: asserts
+; Test reassociation involving fma.
+
; The incoming accumulator is stalling so it is worth putting the
 ; multiplications in parallel with it.
define double @fun0_fma2_divop(ptr %x) {
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll
index 1167dbfd06c702..b43ac9d4d528a4 100644
--- a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll
@@ -1,9 +1,10 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
; RUN: -print-before=machine-combiner -print-after=machine-combiner -z-fma \
; RUN: 2>&1 | FileCheck %s
-
; REQUIRES: asserts
+; Test reassociation involving fma.
+
; No improvement possible.
define double @fun0_fma1add(ptr %x) {
; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
@@ -46,18 +47,17 @@ define double @fun1_fma1add_divop(ptr %x) {
; CHECK-NEXT: %0:addr64bit = COPY $r2d
; CHECK-NEXT: [[M21:%1:vr64bit]] = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
; CHECK-NEXT: [[M22:%2:vr64bit]] = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
-; CHECK-NEXT: [[T1:%3:vr64bit]] = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
-; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
-; CHECK-NEXT: [[DIV:%5:vr64bit]] = nofpexcept WFDDB [[T1]], killed %4:vr64bit, implicit $fpc
-; CHECK-NEXT: %6:vr64bit = {{.*}} WFADB_CCPseudo [[T1]], killed [[DIV]]
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], killed %6:vr64bit
-; CHECK-NEXT: $f0d = COPY %7:vr64bit
+; CHECK-NEXT: [[T1:%3:fp64bit]] = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: [[DIV:%4:fp64bit]] = nofpexcept DDB %3:fp64bit(tied-def 0), %0:addr64bit, 24, $noreg
+; CHECK-NEXT: %5:vr64bit = {{.*}} WFADB_CCPseudo [[T1]], killed [[DIV]]
+; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], killed %5:vr64bit
+; CHECK-NEXT: $f0d = COPY %6:vr64bit
; CHECK-NEXT: Return implicit $f0d
; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
; CHECK-NEXT: # Machine code for function fun1_fma1add_divop: IsSSA, TracksLiveness
-; CHECK: %8:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], [[T1]]
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFADB_CCPseudo %8:vr64bit, killed [[DIV]]
+; CHECK: %7:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], [[T1]]
+; CHECK-NEXT: %6:vr64bit = {{.*}} WFADB_CCPseudo %7:vr64bit, killed [[DIV]]
entry:
%arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
%arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll
index 9a8fa90ef70b64..8bacf8eec64965 100644
--- a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll
@@ -1,11 +1,9 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -O3 -print-before=machine-combiner \
; RUN: -print-after=machine-combiner -debug-only=machine-combiner,systemz-II -z-fma 2>&1 \
; RUN: | FileCheck %s
-
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -O3 \
; RUN: -print-after=machine-combiner -debug-only=machine-combiner,systemz-II -ppc-fma 2>&1 \
; RUN: | FileCheck %s --check-prefix=ALT
-
; REQUIRES: asserts
; Test transformation of a sequence of 8 FMAs, with different patterns.
>From 8ec23a65a6b6d817cae5cefc6c19a81c8c2fd38b Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Tue, 16 Apr 2024 10:30:22 +0200
Subject: [PATCH 5/9] Final
---
.../llvm/CodeGen/MachineCombinerPattern.h | 9 -
llvm/lib/CodeGen/MachineCombiner.cpp | 7 -
.../SystemZ/SystemZFinalizeReassociation.cpp | 38 +-
.../Target/SystemZ/SystemZISelDAGToDAG.cpp | 3 +-
llvm/lib/Target/SystemZ/SystemZInstrFP.td | 6 +-
.../lib/Target/SystemZ/SystemZInstrFormats.td | 19 +-
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 525 +++---------------
llvm/lib/Target/SystemZ/SystemZInstrInfo.h | 26 +-
llvm/lib/Target/SystemZ/SystemZInstrVector.td | 21 +-
llvm/lib/Target/SystemZ/SystemZOperators.td | 11 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ13.td | 2 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ14.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ15.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ16.td | 4 +-
.../Target/SystemZ/SystemZTargetMachine.cpp | 8 +-
llvm/test/CodeGen/SystemZ/fp-add-01.ll | 12 +
llvm/test/CodeGen/SystemZ/fp-add-02.ll | 2 +-
llvm/test/CodeGen/SystemZ/fp-mul-01.ll | 12 +
llvm/test/CodeGen/SystemZ/fp-mul-02.ll | 12 +-
llvm/test/CodeGen/SystemZ/fp-mul-03.ll | 14 +
llvm/test/CodeGen/SystemZ/fp-sub-01.ll | 12 +
llvm/test/CodeGen/SystemZ/fp-sub-02.ll | 14 +
.../SystemZ/machine-combiner-reassoc-fp-03.ll | 91 ---
.../SystemZ/machine-combiner-reassoc-fp-04.ll | 124 -----
.../SystemZ/machine-combiner-reassoc-fp-08.ll | 116 ----
.../SystemZ/machine-combiner-reassoc-fp-09.ll | 175 ------
...p-01.ll => machine-combiner-reassoc-fp.ll} | 0
27 files changed, 177 insertions(+), 1094 deletions(-)
delete mode 100644 llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll
delete mode 100644 llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll
delete mode 100644 llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll
delete mode 100644 llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll
rename llvm/test/CodeGen/SystemZ/{machine-combiner-reassoc-fp-01.ll => machine-combiner-reassoc-fp.ll} (100%)
diff --git a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
index b9d568f3e230ef..3428c4dde5c7fc 100644
--- a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
+++ b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
@@ -34,15 +34,6 @@ enum MachineCombinerPattern : unsigned {
REASSOC_XA_YB,
TARGET_PATTERN_START
- // SystemZ patterns. (EXPERIMENTAL)
- FMA2_P1P0,
- FMA2_P0P1,
- FMA2,
- FMA1_Add_L,
- FMA1_Add_R,
- FMA3, // These are inspired by PPC
- FMA2_Add, //
-
};
} // end namespace llvm
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index a1ccca790fca9a..c11263163a34ff 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -306,13 +306,6 @@ CombinerObjective MachineCombiner::getCombinerObjective(unsigned Pattern) {
case MachineCombinerPattern::REASSOC_AX_YB:
case MachineCombinerPattern::REASSOC_XA_BY:
case MachineCombinerPattern::REASSOC_XA_YB:
- case MachineCombinerPattern::FMA2_P1P0:
- case MachineCombinerPattern::FMA2_P0P1:
- case MachineCombinerPattern::FMA2:
- case MachineCombinerPattern::FMA1_Add_L:
- case MachineCombinerPattern::FMA1_Add_R:
- case MachineCombinerPattern::FMA3:
- case MachineCombinerPattern::FMA2_Add:
return CombinerObjective::MustReduceDepth;
default:
return TII->getCombinerObjective(Pattern);
diff --git a/llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp b/llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp
index 2b5b66afa317b5..c98ef2df214b78 100644
--- a/llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp
@@ -1,4 +1,4 @@
-//===---- SystemZFinalizeReassociation.cpp - Finalize FP reassociation ----===//
+//===----- SystemZFinalizeReassociation.cpp - Finalize reassociation ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -11,12 +11,13 @@
//
// 1. Instruction selection: Disable reg/mem folding for any operations that
// are reassociable since MachineCombiner will not succeed otherwise.
-// Select a reg/reg pseudo that pretends to clobber CC since the reg/mem
+// Select a reg/reg pseudo that pretends to clobber CC if the reg/mem
// opcode clobbers it.
//
-// 2. MachineCombiner: Performs reassociation with the reg/reg instructions.
+// 2. MachineCombiner: reassociation with the reg/reg instructions.
//
-// 3. PeepholeOptimizer: Fold loads into reg/mem instructions.
+// 3. PeepholeOptimizer: Fold loads and reg/reg pseudos into reg/mem
+// instructions.
//
// 4. This pass: Convert any remaining reg/reg pseudos.
//
@@ -33,8 +34,7 @@ namespace {
class SystemZFinalizeReassociation : public MachineFunctionPass {
public:
static char ID;
- SystemZFinalizeReassociation()
- : MachineFunctionPass(ID), TII(nullptr) {
+ SystemZFinalizeReassociation() : MachineFunctionPass(ID), TII(nullptr) {
initializeSystemZFinalizeReassociationPass(*PassRegistry::getPassRegistry());
}
@@ -42,7 +42,6 @@ class SystemZFinalizeReassociation : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
-
bool visitMBB(MachineBasicBlock &MBB);
const SystemZInstrInfo *TII;
@@ -55,8 +54,8 @@ char SystemZFinalizeReassociation::ID = 0;
INITIALIZE_PASS(SystemZFinalizeReassociation, "systemz-finalize-reassoc",
"SystemZ Finalize Reassociation", false, false)
-FunctionPass *llvm::
-createSystemZFinalizeReassociationPass(SystemZTargetMachine &TM) {
+FunctionPass *
+llvm::createSystemZFinalizeReassociationPass(SystemZTargetMachine &TM) {
return new SystemZFinalizeReassociation();
}
@@ -70,18 +69,17 @@ bool SystemZFinalizeReassociation::visitMBB(MachineBasicBlock &MBB) {
for (MachineInstr &MI : MBB) {
unsigned PseudoOpcode = MI.getOpcode();
unsigned TargetOpcode =
- PseudoOpcode == SystemZ::WFADB_CCPseudo ? SystemZ::WFADB
- : PseudoOpcode == SystemZ::WFASB_CCPseudo ? SystemZ::WFASB
- : PseudoOpcode == SystemZ::WFSDB_CCPseudo ? SystemZ::WFSDB
- : PseudoOpcode == SystemZ::WFSSB_CCPseudo ? SystemZ::WFSSB
- : PseudoOpcode == SystemZ::WFMADB_CCPseudo ? SystemZ::WFMADB
- : PseudoOpcode == SystemZ::WFMASB_CCPseudo ? SystemZ::WFMASB
- : 0;
+ PseudoOpcode == SystemZ::WFADB_CCPseudo ? SystemZ::WFADB
+ : PseudoOpcode == SystemZ::WFASB_CCPseudo ? SystemZ::WFASB
+ : PseudoOpcode == SystemZ::WFSDB_CCPseudo ? SystemZ::WFSDB
+ : PseudoOpcode == SystemZ::WFSSB_CCPseudo ? SystemZ::WFSSB
+ : 0;
if (TargetOpcode) {
- MI.setDesc(TII->get(TargetOpcode));
- int CCIdx = MI.findRegisterDefOperandIdx(SystemZ::CC);
- MI.removeOperand(CCIdx);
- Changed = true;
+ MI.setDesc(TII->get(TargetOpcode));
+ int CCIdx = MI.findRegisterDefOperandIdx(SystemZ::CC, /*isDead=*/true);
+ assert(CCIdx != -1 && "Expected dead CC-def.");
+ MI.removeOperand(CCIdx);
+ Changed = true;
}
}
return Changed;
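
For reference, here is a minimal sketch (not part of the patch) of the kind of
input that exercises the four-step flow described in the pass header above,
assuming it is compiled with something like
llc -mtriple=s390x-linux-gnu -mcpu=z15 -O3; the function and value names are
made up purely for illustration:

; Chain of reassociable f64 adds with memory operands. ISel keeps the adds in
; reg/reg (pseudo) form, MachineCombiner reassociates them into a shallower
; tree, and PeepholeOptimizer may then fold the loads into reg/mem adds.
define double @reassoc_add_chain(ptr %x) {
entry:
  %a = load double, ptr %x, align 8
  %p1 = getelementptr inbounds double, ptr %x, i64 1
  %b = load double, ptr %p1, align 8
  %p2 = getelementptr inbounds double, ptr %x, i64 2
  %c = load double, ptr %p2, align 8
  %p3 = getelementptr inbounds double, ptr %x, i64 3
  %d = load double, ptr %p3, align 8
  %s1 = fadd reassoc nsz double %a, %b
  %s2 = fadd reassoc nsz double %s1, %c
  %s3 = fadd reassoc nsz double %s2, %d
  ret double %s3
}
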
diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 02ee0b9e2920eb..1bf23c2e8e4127 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -2053,8 +2053,7 @@ bool SystemZDAGToDAGISel::shouldSelectForReassoc(SDNode *N) const {
EVT VT = N->getValueType(0);
assert(VT.isFloatingPoint() && "Expected FP SDNode");
return N->getFlags().hasAllowReassociation() &&
- N->getFlags().hasNoSignedZeros() &&
- Subtarget->hasVector() &&
+ N->getFlags().hasNoSignedZeros() && Subtarget->hasVector() &&
(VT != MVT::f32 || Subtarget->hasVectorEnhancements1()) &&
!N->isStrictFPOpcode();
}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 5968fe8d744a55..aad04a2b4159cb 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -501,10 +501,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MAEBR : TernaryRRD<"maebr", 0xB30E, z_any_fma, FP32, FP32>;
def MADBR : TernaryRRD<"madbr", 0xB31E, z_any_fma, FP64, FP64>;
- defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma_noreassoc, FP32,
- FP32, z_load, 4>;
- defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma_noreassoc, FP64,
- FP64, z_load, 8>;
+ defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, z_load, 4>;
+ defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, z_load, 8>;
}
// Fused multiply-subtract.
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index a776d5d4dad490..62a7d93106bc68 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5537,6 +5537,9 @@ multiclass StringRRE<string mnemonic, bits<16> opcode,
GR32:$char))]>;
}
+// Duplicate the instruction with a pseudo that defines CC, to be selected
+// in cases where reassociation is enabled. The CC operand is needed to allow
+// later reg/mem folding into instructions that clobber CC.
multiclass BinaryVRRcAndCCPseudo<string mnemonic, bits<16> opcode,
SDPatternOperator operator,
SDPatternOperator reassoc_operator,
@@ -5551,19 +5554,3 @@ multiclass BinaryVRRcAndCCPseudo<string mnemonic, bits<16> opcode,
(reassoc_operator (tr2.vt tr2.op:$V2),
(tr2.vt tr2.op:$V3)))]>;
}
-
-multiclass TernaryVRReAndCCPseudo<string mnemonic, bits<16> opcode,
- SDPatternOperator operator,
- SDPatternOperator reassoc_operator,
- TypedReg tr1, TypedReg tr2, bits<4> m5 = 0,
- bits<4> type = 0, string fp_mnemonic = ""> {
- def "" : TernaryVRRe<mnemonic, opcode, operator, tr1, tr2, m5, type,
- fp_mnemonic>;
- let Defs = [CC], AddedComplexity = 1 in // Win over "".
- def _CCPseudo : Pseudo<(outs tr1.op:$V1),
- (ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
- [(set (tr1.vt tr1.op:$V1),
- (reassoc_operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- (tr1.vt tr1.op:$V4)))]>;
-}
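
As a hedged illustration (not part of the patch) of when the _CCPseudo
variants are meant to be selected, the operation needs both the reassoc and
nsz flags, on a CPU with the vector facility (and vector-enhancements-1 for
f32) as checked by shouldSelectForReassoc above; without the flags the normal
reg/mem selection stays eligible. Names below are illustrative only:

define float @selects_ccpseudo(float %a, float %b) {
  ; Carries reassoc + nsz, so ISel can pick the reg/reg pseudo form and leave
  ; reg/mem folding to the later peephole.
  %r = fadd reassoc nsz float %a, %b
  ret float %r
}

define float @keeps_regmem(float %acc, ptr %p) {
  ; No reassociation flags, so the usual reg/mem folding (e.g. aeb) applies
  ; directly at instruction selection.
  %m = load float, ptr %p, align 4
  %r = fadd float %acc, %m
  ret float %r
}
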
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index ff47b56447fbab..dee1b365e029d9 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -21,7 +21,6 @@
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -619,22 +618,16 @@ static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
}
}
-void SystemZInstrInfo::transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
- MachineInstr::MIFlag Flag) const {
+static void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
+ MachineInstr::MIFlag Flag) {
if (OldMI->getFlag(Flag))
NewMI->setFlag(Flag);
}
-static cl::opt<bool> DISABLE_FOLDING("disable-folding", cl::init(false));
-static cl::opt<bool> FOLD_LDY("fold-ldy", cl::init(false));
-
MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
- if (DISABLE_FOLDING)
- return nullptr;
-
const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
// Check whether we can move the DefMI load, and that it only has one use.
@@ -648,80 +641,54 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
// For reassociable FP operations, any loads have been purposefully left
// unfolded so that MachineCombiner can do its work on reg/reg
// opcodes. After that, as many loads as possible are now folded.
- unsigned LoadOpcD12 = 0;
- unsigned LoadOpcD20 = 0;
+ unsigned LoadOpc = 0;
unsigned RegMemOpcode = 0;
const TargetRegisterClass *FPRC = nullptr;
- RegMemOpcode = MI.getOpcode() == SystemZ::WFADB_CCPseudo ? SystemZ::ADB
- : MI.getOpcode() == SystemZ::WFSDB_CCPseudo ? SystemZ::SDB
- : MI.getOpcode() == SystemZ::WFMDB ? SystemZ::MDB
- : MI.getOpcode() == SystemZ::WFMADB_CCPseudo ? SystemZ::MADB
- : 0;
+ RegMemOpcode = MI.getOpcode() == SystemZ::WFADB_CCPseudo ? SystemZ::ADB
+ : MI.getOpcode() == SystemZ::WFSDB_CCPseudo ? SystemZ::SDB
+ : MI.getOpcode() == SystemZ::WFMDB ? SystemZ::MDB
+ : 0;
if (RegMemOpcode) {
- LoadOpcD12 = SystemZ::VL64;
- LoadOpcD20 = SystemZ::LDY;
+ LoadOpc = SystemZ::VL64;
FPRC = &SystemZ::FP64BitRegClass;
} else {
- RegMemOpcode = MI.getOpcode() == SystemZ::WFASB_CCPseudo ? SystemZ::AEB
- : MI.getOpcode() == SystemZ::WFSSB_CCPseudo ? SystemZ::SEB
- : MI.getOpcode() == SystemZ::WFMSB ? SystemZ::MEEB
- : MI.getOpcode() == SystemZ::WFMASB_CCPseudo ? SystemZ::MAEB
- : 0;
+ RegMemOpcode = MI.getOpcode() == SystemZ::WFASB_CCPseudo ? SystemZ::AEB
+ : MI.getOpcode() == SystemZ::WFSSB_CCPseudo ? SystemZ::SEB
+ : MI.getOpcode() == SystemZ::WFMSB ? SystemZ::MEEB
+ : 0;
if (RegMemOpcode) {
- LoadOpcD12 = SystemZ::VL32;
- LoadOpcD20 = SystemZ::LEY;
+ LoadOpc = SystemZ::VL32;
FPRC = &SystemZ::FP32BitRegClass;
}
}
-
- if (!RegMemOpcode ||
- (DefMI->getOpcode() != LoadOpcD12 && DefMI->getOpcode() != LoadOpcD20))
+ if (!RegMemOpcode || DefMI->getOpcode() != LoadOpc)
return nullptr;
+ assert((MI.findRegisterDefOperandIdx(SystemZ::CC) == -1 ||
+ MI.findRegisterDefOperandIdx(SystemZ::CC, /*isDead=*/true) != -1) &&
+ "Expected dead CC-def on add/sub pseudo instruction.");
- if (DefMI->getOpcode() == LoadOpcD20 && !FOLD_LDY)
- return nullptr;
-
- DebugLoc DL = MI.getDebugLoc();
Register DstReg = MI.getOperand(0).getReg();
MachineOperand LHS = MI.getOperand(1);
MachineOperand RHS = MI.getOperand(2);
- bool IsTernary =
- (RegMemOpcode == SystemZ::MADB || RegMemOpcode == SystemZ::MAEB);
MachineOperand &RegMO = RHS.getReg() == FoldAsLoadDefReg ? LHS : RHS;
- MachineOperand *AccMO = IsTernary ? &MI.getOperand(3) : nullptr;
if ((RegMemOpcode == SystemZ::SDB || RegMemOpcode == SystemZ::SEB) &&
FoldAsLoadDefReg != RHS.getReg())
return nullptr;
- if (IsTernary && FoldAsLoadDefReg == AccMO->getReg())
- return nullptr;
-
- MachineInstrBuilder MIB =
- BuildMI(*MI.getParent(), MI, DL, get(RegMemOpcode), DstReg);
- MRI->setRegClass(DstReg, FPRC);
- if (IsTernary) {
- MIB.add(*AccMO);
- MRI->setRegClass(AccMO->getReg(), FPRC);
- }
- MIB.add(RegMO);
- MRI->setRegClass(RegMO.getReg(), FPRC);
MachineOperand &Base = DefMI->getOperand(1);
MachineOperand &Disp = DefMI->getOperand(2);
MachineOperand &Indx = DefMI->getOperand(3);
- if (Base.isReg()) // Could be a FrameIndex.
- Base.setIsKill(false);
- Indx.setIsKill(false);
- if (DefMI->getOpcode() == LoadOpcD12) {
- MIB.add(Base).add(Disp).add(Indx);
- } else {
- Register AddrReg = MRI->createVirtualRegister(&SystemZ::ADDR64BitRegClass);
- BuildMI(*MI.getParent(), *MIB, DL, get(SystemZ::LAY), AddrReg)
- .add(Base).add(Disp).add(Indx);
- MIB.addReg(AddrReg).addImm(0).addReg(SystemZ::NoRegister);
- }
- MIB.addMemOperand(*DefMI->memoperands_begin());
- transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);
+ MachineInstrBuilder MIB =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(RegMemOpcode), DstReg)
+ .add(RegMO)
+ .add(Base)
+ .add(Disp)
+ .add(Indx)
+ .addMemOperand(*DefMI->memoperands_begin());
MIB->addRegisterDead(SystemZ::CC, TRI);
+ MRI->setRegClass(DstReg, FPRC);
+ MRI->setRegClass(RegMO.getReg(), FPRC);
+ transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);
return MIB;
}
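
As a small illustrative example (not from the patch) of the reg/mem folding
this rewritten hook performs, note that for subtraction only the right-hand
operand may come from memory, matching the SDB/SEB check above; the name below
is hypothetical:

define double @fold_rhs_only(double %acc, ptr %p) {
  ; The load feeds the subtrahend, so it can be folded into an sdb-style
  ; reg/mem subtract once the reassociation pseudo has been rewritten.
  %m = load double, ptr %p, align 8
  %r = fsub reassoc nsz double %acc, %m
  ret double %r
}
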
@@ -1106,148 +1073,21 @@ SystemZInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
return nullptr;
}
-static bool hasReassocFlags(const MachineInstr *MI) {
- return (MI->getFlag(MachineInstr::MIFlag::FmReassoc) &&
- MI->getFlag(MachineInstr::MIFlag::FmNsz));
-}
-
-bool SystemZInstrInfo::IsReassociableFMA(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
- case SystemZ::VFMADB:
- case SystemZ::VFMASB:
- case SystemZ::WFMAXB:
- case SystemZ::WFMADB_CCPseudo:
- case SystemZ::WFMASB_CCPseudo:
- return hasReassocFlags(MI);
- default:
- break;
- }
- return false;
-}
-
-bool SystemZInstrInfo::IsReassociableAdd(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
- case SystemZ::VFADB:
- case SystemZ::VFASB:
- case SystemZ::WFAXB:
- return hasReassocFlags(MI);
- case SystemZ::WFADB_CCPseudo:
- case SystemZ::WFASB_CCPseudo:
- return hasReassocFlags(MI) &&
- MI->findRegisterDefOperandIdx(SystemZ::CC, true/*isDead*/) != -1;
- default:
- break;
- }
- return false;
-}
-
-// EXPERIMENTAL
-static cl::opt<bool> Z_FMA("z-fma", cl::init(false));
-static cl::opt<bool> PPC_FMA("ppc-fma", cl::init(false));
-
-bool SystemZInstrInfo::getFMAPatterns(
- MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
- bool DoRegPressureReduce) const {
- assert(Patterns.empty());
- MachineBasicBlock *MBB = Root.getParent();
- const MachineRegisterInfo *MRI = &MBB->getParent()->getRegInfo();
-
- if (!IsReassociableFMA(&Root))
- return false;
-
- const TargetRegisterClass *RC = MRI->getRegClass(Root.getOperand(0).getReg());
-
- // This is more or less always true.
- auto AllOpsOK = [&MRI, &RC](const MachineInstr &Instr) {
- for (const auto &MO : Instr.explicit_operands())
- if (!(MO.isReg() && MO.getReg().isVirtual() && !MO.getSubReg()))
- return false;
- const TargetRegisterClass *DefRC = MRI->getRegClass(Instr.getOperand(0).getReg());
- if (!DefRC->hasSubClassEq(RC) && !DefRC->hasSuperClassEq(RC))
- return false;
- return true;
- };
- if (!AllOpsOK(Root))
- return false;
-
- // XXX Rewrite this for the patterns we want to actually use.
- MachineInstr *TopAdd = nullptr;
- std::vector<MachineInstr *> FMAChain;
- FMAChain.push_back(&Root);
- Register Acc = Root.getOperand(3).getReg();
- while (MachineInstr *Prev = MRI->getUniqueVRegDef(Acc)) {
- if (Prev->getParent() != MBB || !MRI->hasOneNonDBGUse(Acc) ||
- !AllOpsOK(*Prev))
- break;
- if (IsReassociableFMA(Prev)) {
- FMAChain.push_back(Prev);
- Acc = Prev->getOperand(3).getReg();
- continue;
- }
- if (IsReassociableAdd(Prev))
- TopAdd = Prev;
- break;
- }
-
- if (Z_FMA) {
- if (FMAChain.size() >= 2) {
- Patterns.push_back(MachineCombinerPattern::FMA2_P1P0);
- LLVM_DEBUG(dbgs() << "add pattern FMA2_P1P0\n");
- Patterns.push_back(MachineCombinerPattern::FMA2_P0P1);
- LLVM_DEBUG(dbgs() << "add pattern FMA2_P0P1\n");
- Patterns.push_back(MachineCombinerPattern::FMA2);
- LLVM_DEBUG(dbgs() << "add pattern FMA2\n");
- }
- if (FMAChain.size() == 1 && TopAdd) {
- // The latency of the FMA could potentially be hidden above the add:
- // Try both sides of the add and let MachineCombiner decide on
- // profitability.
- Patterns.push_back(MachineCombinerPattern::FMA1_Add_L);
- LLVM_DEBUG(dbgs() << "add pattern FMA1_Add_L\n");
- Patterns.push_back(MachineCombinerPattern::FMA1_Add_R);
- LLVM_DEBUG(dbgs() << "add pattern FMA1_Add_R\n");
- }
- } else if (PPC_FMA) {
- if (FMAChain.size() >= 3) {
- Patterns.push_back(MachineCombinerPattern::FMA3);
- LLVM_DEBUG(dbgs() << "add pattern FMA3\n");
- }
- if (FMAChain.size() == 2 && TopAdd) {
- Patterns.push_back(MachineCombinerPattern::FMA2_Add);
- LLVM_DEBUG(dbgs() << "add pattern FMA2_Add\n");
- }
- }
-
- return Patterns.size() > 0;
-}
-
-bool SystemZInstrInfo::getMachineCombinerPatterns(
- MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
- bool DoRegPressureReduce) const {
-
- if (getFMAPatterns(Root, Patterns, DoRegPressureReduce))
- return true;
-
- return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
- DoRegPressureReduce);
-}
-
void SystemZInstrInfo::finalizeInsInstrs(
- MachineInstr &Root, MachineCombinerPattern &P,
+ MachineInstr &Root, unsigned &P,
SmallVectorImpl<MachineInstr *> &InsInstrs) const {
const TargetRegisterInfo *TRI =
- Root.getParent()->getParent()->getSubtarget().getRegisterInfo();
+ Root.getParent()->getParent()->getSubtarget().getRegisterInfo();
for (auto *Inst : InsInstrs) {
switch (Inst->getOpcode()) {
case SystemZ::WFADB_CCPseudo:
case SystemZ::WFASB_CCPseudo:
case SystemZ::WFSDB_CCPseudo:
case SystemZ::WFSSB_CCPseudo:
- case SystemZ::WFMADB_CCPseudo:
- case SystemZ::WFMASB_CCPseudo:
Inst->addRegisterDead(SystemZ::CC, TRI);
break;
- default: break;
+ default:
+ break;
}
}
}
@@ -1266,17 +1106,21 @@ bool SystemZInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
default:
break;
// Adds and multiplications.
- case SystemZ::VFADB:
- case SystemZ::VFASB:
- case SystemZ::WFAXB:
case SystemZ::WFADB_CCPseudo:
case SystemZ::WFASB_CCPseudo:
- case SystemZ::VFMDB:
- case SystemZ::VFMSB:
- case SystemZ::WFMXB:
+ assert(Inst.findRegisterDefOperandIdx(SystemZ::CC, /*isDead=*/true) != -1 &&
+ "Expected dead CC-def on add/sub pseudo instruction.");
+ LLVM_FALLTHROUGH;
+ case SystemZ::WFAXB:
+ case SystemZ::VFADB:
+ case SystemZ::VFASB:
case SystemZ::WFMDB:
case SystemZ::WFMSB:
- return hasReassocFlags(&Inst);
+ case SystemZ::WFMXB:
+ case SystemZ::VFMDB:
+ case SystemZ::VFMSB:
+ return (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+ Inst.getFlag(MachineInstr::MIFlag::FmNsz));
}
return false;
@@ -1284,271 +1128,32 @@ bool SystemZInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
std::optional<unsigned>
SystemZInstrInfo::getInverseOpcode(unsigned Opcode) const {
- // fadd <=> fsub in various forms.
+ // fadd => fsub
switch (Opcode) {
- case SystemZ::VFADB: return SystemZ::VFSDB;
- case SystemZ::VFASB: return SystemZ::VFSSB;
- case SystemZ::WFAXB: return SystemZ::WFSXB;
- case SystemZ::WFADB_CCPseudo: return SystemZ::WFSDB_CCPseudo;
- case SystemZ::WFASB_CCPseudo: return SystemZ::WFSSB_CCPseudo;
- case SystemZ::VFSDB: return SystemZ::VFADB;
- case SystemZ::VFSSB: return SystemZ::VFASB;
- case SystemZ::WFSXB: return SystemZ::WFAXB;
- case SystemZ::WFSDB_CCPseudo: return SystemZ::WFADB_CCPseudo;
- case SystemZ::WFSSB_CCPseudo: return SystemZ::WFASB_CCPseudo;
- default: return std::nullopt;
- }
-}
-
-void SystemZInstrInfo::genAlternativeCodeSequence(
- MachineInstr &Root, MachineCombinerPattern Pattern,
- SmallVectorImpl<MachineInstr *> &InsInstrs,
- SmallVectorImpl<MachineInstr *> &DelInstrs,
- DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
- switch (Pattern) {
- case MachineCombinerPattern::FMA2_P1P0:
- case MachineCombinerPattern::FMA2_P0P1:
- case MachineCombinerPattern::FMA2:
- case MachineCombinerPattern::FMA1_Add_L:
- case MachineCombinerPattern::FMA1_Add_R:
- case MachineCombinerPattern::FMA3:
- case MachineCombinerPattern::FMA2_Add:
- reassociateFMA(Root, Pattern, InsInstrs, DelInstrs, InstrIdxForVirtReg);
- break;
- default:
- // Reassociate default patterns.
- TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
- DelInstrs, InstrIdxForVirtReg);
- break;
- }
-}
-
-static void getSplitFMAOpcodes(unsigned FMAOpc, unsigned &AddOpc,
- unsigned &MulOpc) {
- switch (FMAOpc) {
- case SystemZ::VFMADB: AddOpc = SystemZ::VFADB; MulOpc = SystemZ::VFMDB; break;
- case SystemZ::VFMASB: AddOpc = SystemZ::VFASB; MulOpc = SystemZ::VFMSB; break;
- case SystemZ::WFMAXB: AddOpc = SystemZ::WFAXB; MulOpc = SystemZ::WFMXB; break;
- case SystemZ::WFMADB_CCPseudo:
- AddOpc = SystemZ::WFADB_CCPseudo; MulOpc = SystemZ::WFMDB; break;
- case SystemZ::WFMASB_CCPseudo:
- AddOpc = SystemZ::WFASB_CCPseudo; MulOpc = SystemZ::WFMSB; break;
- default:
- llvm_unreachable("Expected FMA opcode.");
- }
-}
-
-void SystemZInstrInfo::reassociateFMA(
- MachineInstr &Root, MachineCombinerPattern Pattern,
- SmallVectorImpl<MachineInstr *> &InsInstrs,
- SmallVectorImpl<MachineInstr *> &DelInstrs,
- DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
- MachineFunction *MF = Root.getMF();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- const TargetRegisterInfo *TRI = &getRegisterInfo();
-
- const TargetRegisterClass *RC = Root.getRegClassConstraint(0, this, TRI);
- Register DstReg = Root.getOperand(0).getReg();
- std::vector<MachineInstr *> Chain; // XXX Rework this method for final patterns used.
- Chain.push_back(&Root);
-
- uint16_t IntersectedFlags = Root.getFlags();
- auto getIntersectedFlags = [&]() {
- for (auto *MI : Chain)
- IntersectedFlags &= MI->getFlags();
- };
-
- auto createNewVReg = [&](unsigned NewInsIdx) -> Register {
- Register NewReg = MRI.createVirtualRegister(RC);
- InstrIdxForVirtReg.insert(std::make_pair(NewReg, NewInsIdx));
- return NewReg;
- };
-
- auto finalizeNewMIs = [&](ArrayRef<MachineInstr *> NewMIs) {
- for (auto *MI : NewMIs) {
- setSpecialOperandAttr(*MI, IntersectedFlags);
- MI->addRegisterDead(SystemZ::CC, TRI);
- InsInstrs.push_back(MI);
- }
- };
-
- auto deleteOld = [&InsInstrs, &DelInstrs, &Chain]() {
- assert(!InsInstrs.empty() &&
- "Insertion instructions set should not be empty!");
- // Record old instructions for deletion.
- for (auto *MI : make_range(Chain.rbegin(), Chain.rend()))
- DelInstrs.push_back(MI);
- };
-
- assert(IsReassociableFMA(&Root));
- unsigned FMAOpc = Root.getOpcode();
- unsigned AddOpc, MulOpc;
- getSplitFMAOpcodes(FMAOpc, AddOpc, MulOpc);
-
-#ifndef NDEBUG
- auto IsAllFMA = [&Chain, &FMAOpc]() {
- for (auto *MI : Chain)
- if (MI->getOpcode() != FMAOpc)
- return false;
- return true;
- };
-#endif
-
- switch (Pattern) {
- case MachineCombinerPattern::FMA2_P1P0:
- case MachineCombinerPattern::FMA2_P0P1: {
- if (Pattern == MachineCombinerPattern::FMA2_P1P0)
- LLVM_DEBUG(dbgs() << "reassociating using pattern FMA_P1P0\n");
- else
- LLVM_DEBUG(dbgs() << "reassociating using pattern FMA_P0P1\n");
- Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
- assert(IsAllFMA());
- getIntersectedFlags(); // XXX Refactor (here and below)
- Register NewVRA = createNewVReg(0);
- Register NewVRB = createNewVReg(1);
- unsigned FirstMulIdx =
- Pattern == MachineCombinerPattern::FMA2_P1P0 ? 1 : 0;
- unsigned SecondMulIdx = FirstMulIdx == 0 ? 1 : 0;
- MachineInstr *MINewA =
- BuildMI(*MF, Chain[FirstMulIdx]->getDebugLoc(), get(MulOpc), NewVRA)
- .add(Chain[FirstMulIdx]->getOperand(1))
- .add(Chain[FirstMulIdx]->getOperand(2));
- MachineInstr *MINewB =
- BuildMI(*MF, Chain[SecondMulIdx]->getDebugLoc(), get(FMAOpc), NewVRB)
- .add(Chain[SecondMulIdx]->getOperand(1))
- .add(Chain[SecondMulIdx]->getOperand(2))
- .addReg(NewVRA);
- MachineInstr *MINewC =
- BuildMI(*MF, Chain[1]->getDebugLoc(), get(AddOpc), DstReg)
- .add(Chain[1]->getOperand(3))
- .addReg(NewVRB);
- finalizeNewMIs({MINewA, MINewB, MINewC});
- break;
- }
- case MachineCombinerPattern::FMA2: {
- LLVM_DEBUG(dbgs() << "reassociating using pattern FMA2\n");
- Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
- assert(IsAllFMA());
- getIntersectedFlags();
- Register NewVRA = createNewVReg(0);
- MachineInstr *MINewA =
- BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRA)
- .add(Chain[0]->getOperand(1))
- .add(Chain[0]->getOperand(2))
- .add(Chain[1]->getOperand(3));
- MachineInstr *MINewB =
- BuildMI(*MF, Chain[1]->getDebugLoc(), get(FMAOpc), DstReg)
- .add(Chain[1]->getOperand(1))
- .add(Chain[1]->getOperand(2))
- .addReg(NewVRA);
- finalizeNewMIs({MINewA, MINewB});
- break;
- }
- case MachineCombinerPattern::FMA1_Add_L:
- case MachineCombinerPattern::FMA1_Add_R: {
- if (Pattern == MachineCombinerPattern::FMA1_Add_L)
- LLVM_DEBUG(dbgs() << "reassociating using pattern FMA1_Add_L\n");
- else
- LLVM_DEBUG(dbgs() << "reassociating using pattern FMA1_Add_R\n");
- assert(IsAllFMA());
- Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
- assert(Chain.back()->getOpcode() == AddOpc && "Expected matching Add");
- getIntersectedFlags();
- unsigned Op = Pattern == MachineCombinerPattern::FMA1_Add_L ? 1 : 2;
- unsigned OtherOp = Op == 1 ? 2 : 1;
- Register NewVRA = createNewVReg(0);
- MachineInstr *MINewA =
- BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRA)
- .add(Chain[0]->getOperand(1))
- .add(Chain[0]->getOperand(2))
- .add(Chain[1]->getOperand(Op));
- MachineInstr *MINewB =
- BuildMI(*MF, Chain[1]->getDebugLoc(), get(AddOpc), DstReg)
- .addReg(NewVRA)
- .add(Chain[1]->getOperand(OtherOp));
- finalizeNewMIs({MINewA, MINewB});
- break;
- }
- case MachineCombinerPattern::FMA3: {
- LLVM_DEBUG(dbgs() << "reassociating using pattern FMA3\n");
- Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
- Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
- assert(IsAllFMA());
- getIntersectedFlags();
- Register NewVRA = createNewVReg(0);
- Register NewVRB = createNewVReg(1);
- Register NewVRC = createNewVReg(2);
- MachineInstr *MINewA =
- BuildMI(*MF, Chain[2]->getDebugLoc(), get(MulOpc), NewVRA)
- .add(Chain[2]->getOperand(1))
- .add(Chain[2]->getOperand(2));
- MachineInstr *MINewB =
- BuildMI(*MF, Chain[1]->getDebugLoc(), get(FMAOpc), NewVRB)
- .add(Chain[1]->getOperand(1))
- .add(Chain[1]->getOperand(2))
- .add(Chain[2]->getOperand(3));
- MachineInstr *MINewC =
- BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRC)
- .add(Chain[0]->getOperand(1))
- .add(Chain[0]->getOperand(2))
- .addReg(NewVRA);
- MachineInstr *MINewD =
- BuildMI(*MF, Chain[0]->getDebugLoc(), get(AddOpc), DstReg)
- .addReg(NewVRB)
- .addReg(NewVRC);
- finalizeNewMIs({MINewA, MINewB, MINewC, MINewD});
- break;
- }
- case MachineCombinerPattern::FMA2_Add: {
- LLVM_DEBUG(dbgs() << "reassociating using pattern FMA2_Add\n");
- Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
- assert(IsAllFMA());
- Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
- assert(Chain.back()->getOpcode() == AddOpc && "Expected matching Add");
- getIntersectedFlags();
- Register NewVRA = createNewVReg(0);
- Register NewVRB = createNewVReg(1);
- MachineInstr *MINewA =
- BuildMI(*MF, Chain[1]->getDebugLoc(), get(FMAOpc), NewVRA)
- .add(Chain[1]->getOperand(1))
- .add(Chain[1]->getOperand(2))
- .add(Chain[2]->getOperand(1));
- MachineInstr *MINewB =
- BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRB)
- .add(Chain[0]->getOperand(1))
- .add(Chain[0]->getOperand(2))
- .add(Chain[2]->getOperand(2));
- MachineInstr *MINewC =
- BuildMI(*MF, Chain[0]->getDebugLoc(), get(AddOpc), DstReg)
- .addReg(NewVRA)
- .addReg(NewVRB);
- finalizeNewMIs({MINewA, MINewB, MINewC});
- break;
- }
+ case SystemZ::WFADB_CCPseudo:
+ return SystemZ::WFSDB_CCPseudo;
+ case SystemZ::WFASB_CCPseudo:
+ return SystemZ::WFSSB_CCPseudo;
+ case SystemZ::WFAXB:
+ return SystemZ::WFSXB;
+ case SystemZ::VFADB:
+ return SystemZ::VFSDB;
+ case SystemZ::VFASB:
+ return SystemZ::VFSSB;
+ // fsub => fadd
+ case SystemZ::WFSDB_CCPseudo:
+ return SystemZ::WFADB_CCPseudo;
+ case SystemZ::WFSSB_CCPseudo:
+ return SystemZ::WFASB_CCPseudo;
+ case SystemZ::WFSXB:
+ return SystemZ::WFAXB;
+ case SystemZ::VFSDB:
+ return SystemZ::VFADB;
+ case SystemZ::VFSSB:
+ return SystemZ::VFASB;
default:
- llvm_unreachable("not recognized pattern!");
+ return std::nullopt;
}
-
- deleteOld();
-}
-
-bool
-SystemZInstrInfo::accumulateInstrSeqToRootLatency(MachineInstr &Root) const {
- // This doesn't make much sense for FMA patterns as they typically use an
- // extra Add to do things in parallell.
- if (IsReassociableFMA(&Root)) // XXX Fine tune this a bit depending on
- // used patterns.
- return false;
-
- return true;
-}
-
-void SystemZInstrInfo::setSpecialOperandAttr(MachineInstr &MI,
- uint32_t Flags) const {
- MI.setFlags(Flags);
- MI.clearFlag(MachineInstr::MIFlag::NoSWrap);
- MI.clearFlag(MachineInstr::MIFlag::NoUWrap);
- MI.clearFlag(MachineInstr::MIFlag::IsExact);
}
MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
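The surviving addition in this hunk completes the fadd <-> fsub table in getInverseOpcode(). That hook is what lets the target-independent reassociation code invert an operation when operands are moved across a subtraction, e.g. turning (a - b) + c into (a + c) - b. Below is a minimal, self-contained C++ sketch of the idea; the Opcode enum and names are hypothetical stand-ins, not the real SystemZ opcode definitions.

#include <cstdio>
#include <optional>

// Hypothetical reg/reg FP opcodes (stand-ins for the SystemZ ones above).
enum Opcode { WFADB, WFSDB, VFADB, VFSDB, WFAXB, WFSXB };

// Map each FP add to its sub counterpart and vice versa; anything else is
// not invertible and yields std::nullopt.
std::optional<Opcode> getInverseOpcode(Opcode Opc) {
  switch (Opc) {
  case WFADB: return WFSDB;
  case VFADB: return VFSDB;
  case WFAXB: return WFSXB;
  case WFSDB: return WFADB;
  case VFSDB: return VFADB;
  case WFSXB: return WFAXB;
  }
  return std::nullopt;
}

int main() {
  // Reassociating (a - b) + c into (a + c) - b needs the inverse of the sub.
  if (auto Inv = getInverseOpcode(WFSDB))
    std::printf("inverse of WFSDB is opcode %d\n", static_cast<int>(*Inv));
  return 0;
}
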
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 035a191491a7aa..b0511407b6cd11 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -254,8 +254,6 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
const DebugLoc &DL, Register DstReg,
ArrayRef<MachineOperand> Cond, Register TrueReg,
Register FalseReg) const override;
- void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
- MachineInstr::MIFlag Flag) const;
MachineInstr *optimizeLoadInstr(MachineInstr &MI,
MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
@@ -294,34 +292,12 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
LiveIntervals *LIS) const override;
bool useMachineCombiner() const override { return true; }
- bool IsReassociableFMA(const MachineInstr *MI) const;
- bool IsReassociableAdd(const MachineInstr *MI) const;
- bool getFMAPatterns(MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern> &P,
- bool DoRegPressureReduce) const;
- bool getMachineCombinerPatterns(MachineInstr &Root,
- SmallVectorImpl<MachineCombinerPattern> &P,
- bool DoRegPressureReduce) const override;
void
- finalizeInsInstrs(MachineInstr &Root, MachineCombinerPattern &P,
+ finalizeInsInstrs(MachineInstr &Root, unsigned &P,
SmallVectorImpl<MachineInstr *> &InsInstrs) const override;
bool isAssociativeAndCommutative(const MachineInstr &Inst,
bool Invert) const override;
std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;
- void genAlternativeCodeSequence(
- MachineInstr &Root, MachineCombinerPattern Pattern,
- SmallVectorImpl<MachineInstr *> &InsInstrs,
- SmallVectorImpl<MachineInstr *> &DelInstrs,
- DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const override;
- void reassociateFMA(
- MachineInstr &Root, MachineCombinerPattern Pattern,
- SmallVectorImpl<MachineInstr *> &InsInstrs,
- SmallVectorImpl<MachineInstr *> &DelInstrs,
- DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;
- bool accumulateInstrSeqToRootLatency(MachineInstr &Root) const override;
- // SystemZ specific version of setSpecialOperandAttr that copies Flags to
- // MI and clears nuw, nsw, and exact flags.
- void setSpecialOperandAttr(MachineInstr &MI, uint32_t Flags) const;
MachineInstr *
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 696edf6ce09bf7..965da938fb2f6e 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -1286,20 +1286,17 @@ let Predicates = [FeatureVector] in {
}
}
- // Multiply and add. 64/32-bit may participate in reassociation during
- // machine-combining together with additions. Pretend that they clobber CC
- // so that an Add that gets pulled down to its place can later be folded to
- // a reg/mem, which clobber CC (while MADB/MAEB do not).
+ // Multiply and add.
let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
- def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>;
- def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, any_fma, v128db, v128db, 0, 3>;
- defm WFMADB : TernaryVRReAndCCPseudo<"wfmadb", 0xE78F, any_fma,
- z_fma_reassoc, v64db, v64db, 8, 3, "madbr">;
+ def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>;
+ def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, any_fma, v128db, v128db, 0, 3>;
+ def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, any_fma, v64db, v64db, 8, 3,
+ "madbr">;
let Predicates = [FeatureVectorEnhancements1] in {
- def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, any_fma, v128sb, v128sb, 0, 2>;
- defm WFMASB : TernaryVRReAndCCPseudo<"wfmasb", 0xE78F, any_fma,
- z_fma_reassoc, v32sb, v32sb, 8, 2, "maebr">;
- def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, any_fma, v128xb, v128xb, 8, 4>;
+ def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, any_fma, v128sb, v128sb, 0, 2>;
+ def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, any_fma, v32sb, v32sb, 8, 2,
+ "maebr">;
+ def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, any_fma, v128xb, v128xb, 8, 4>;
}
}
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index 701e3b580a92d6..e02f52426526a5 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -830,7 +830,7 @@ def any_fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
// Floating-point operations which will not participate in reassociation, and
-// therefore candidates for reg/mem folding during isel.
+// therefore are candidates for reg/mem folding during isel.
def z_any_fadd_noreassoc : PatFrag<(ops node:$src1, node:$src2),
(any_fadd node:$src1, node:$src2),
[{ return !shouldSelectForReassoc(N); }]>;
@@ -840,9 +840,6 @@ def z_any_fsub_noreassoc : PatFrag<(ops node:$src1, node:$src2),
def z_any_fmul_noreassoc : PatFrag<(ops node:$src1, node:$src2),
(any_fmul node:$src1, node:$src2),
[{ return !shouldSelectForReassoc(N); }]>;
-def z_any_fma_noreassoc : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (any_fma node:$src2, node:$src3, node:$src1),
- [{ return !shouldSelectForReassoc(N); }]>;
// Floating-point operations which are reassociable, and therefore should be
// selected as reg/reg instructions (no memop folding).
@@ -852,12 +849,6 @@ def z_fadd_reassoc : PatFrag<(ops node:$src1, node:$src2),
def z_fsub_reassoc : PatFrag<(ops node:$src1, node:$src2),
(fsub node:$src1, node:$src2),
[{ return shouldSelectForReassoc(N); }]>;
-def z_fmul_reassoc : PatFrag<(ops node:$src1, node:$src2),
- (fmul node:$src1, node:$src2),
- [{ return shouldSelectForReassoc(N); }]>;
-def z_fma_reassoc : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (fma node:$src1, node:$src2, node:$src3),
- [{ return shouldSelectForReassoc(N); }]>;
// Strict floating-point fragments.
def z_any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index 431e916523dae3..5975d884144a1a 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -1352,7 +1352,7 @@ def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFMDB$")>;
def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFMDB$")>;
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM(A|S)$")>;
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(A|S)DB(_CCPseudo)?$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(A|S)DB$")>;
// Divide / square root
def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFD$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
index 652ab3ea932c5f..f18d304d7a8a37 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -1401,9 +1401,9 @@ def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFMSB$")>;
def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(N)?M(A|S)$")>;
def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB(_CCPseudo)?$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>;
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(N)?M(A|S)SB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB(_CCPseudo)?$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>;
def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>;
// Divide / square root
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
index d53e4d4b97219f..52fa35c5038ccf 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -1443,9 +1443,9 @@ def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>;
def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB(_CCPseudo)?$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)SB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB(_CCPseudo)?$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>;
def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>;
// Divide / square root
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
index 42628cea69555f..975671d1a24436 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
@@ -1449,9 +1449,9 @@ def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>;
def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB(_CCPseudo)?$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)SB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB(_CCPseudo)?$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>;
def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>;
// Divide / square root
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index be13a84ecc3fe4..3ffeb923930333 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -30,10 +30,10 @@
using namespace llvm;
-static cl::opt<bool>
-EnableMachineCombinerPass("systemz-machine-combiner",
- cl::desc("Enable the machine combiner pass"),
- cl::init(true), cl::Hidden);
+static cl::opt<bool> EnableMachineCombinerPass(
+ "systemz-machine-combiner",
+ cl::desc("Enable the machine combiner pass"),
+ cl::init(true), cl::Hidden);
// NOLINTNEXTLINE(readability-identifier-naming)
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
diff --git a/llvm/test/CodeGen/SystemZ/fp-add-01.ll b/llvm/test/CodeGen/SystemZ/fp-add-01.ll
index f60fb8345b4a48..eb845bae9b804b 100644
--- a/llvm/test/CodeGen/SystemZ/fp-add-01.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-add-01.ll
@@ -119,3 +119,15 @@ define float @f7(ptr %ptr0) {
ret float %add10
}
+
+; Check that reassociation flags do not get in the way of AEB.
+define float @f8(ptr %x) {
+; CHECK-LABEL: f8:
+; CHECK: aeb %f0
+entry:
+ %0 = load float, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+ %1 = load float, ptr %arrayidx1, align 8
+ %add = fadd reassoc nsz arcp contract afn float %1, %0
+ ret float %add
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-add-02.ll b/llvm/test/CodeGen/SystemZ/fp-add-02.ll
index 8f65161b5bae83..7866f98240eab3 100644
--- a/llvm/test/CodeGen/SystemZ/fp-add-02.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-add-02.ll
@@ -119,7 +119,7 @@ define double @f7(ptr %ptr0) {
ret double %add10
}
-; Check that reassociation flags do not get in the way of adb.
+; Check that reassociation flags do not get in the way of ADB.
define double @f8(ptr %x) {
; CHECK-LABEL: f8:
; CHECK: ld %f0
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-01.ll b/llvm/test/CodeGen/SystemZ/fp-mul-01.ll
index 144e3208c5eb75..c5e66ff72c2a40 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-01.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-01.ll
@@ -119,3 +119,15 @@ define float @f7(ptr %ptr0) {
ret float %mul10
}
+
+; Check that reassociation flags do not get in the way of MEEB.
+define float @f8(ptr %x) {
+; CHECK-LABEL: f8:
+; CHECK: meeb %f0
+entry:
+ %0 = load float, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+ %1 = load float, ptr %arrayidx1, align 8
+ %add = fmul reassoc nsz arcp contract afn float %1, %0
+ ret float %add
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-02.ll b/llvm/test/CodeGen/SystemZ/fp-mul-02.ll
index 1ac4bbec352d1c..5a99537493cd19 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-02.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-02.ll
@@ -1,6 +1,6 @@
; Test multiplication of two f32s, producing an f64 result.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
declare float @foo()
@@ -201,13 +201,3 @@ define float @f7(ptr %ptr0) {
ret float %trunc9
}
-
-; Check that reassociation flags do not get in the way of mdebr.
-define double @f8(float %Src) {
-; CHECK-LABEL: f8:
-; CHECK: mdebr %f0, %f0
-; CHECK: br %r14
- %D = fpext float %Src to double
- %res = fmul reassoc nsz arcp contract afn double %D, %D
- ret double %res
-}
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-03.ll b/llvm/test/CodeGen/SystemZ/fp-mul-03.ll
index dbd6975af41304..820fdbd6f5bdb2 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-03.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-03.ll
@@ -119,3 +119,17 @@ define double @f7(ptr %ptr0) {
ret double %mul10
}
+
+; Check that reassociation flags do not get in the way of MDB.
+define double @f8(ptr %x) {
+; CHECK-LABEL: f8:
+; CHECK: ld %f0
+; CHECK: mdb %f0
+; CHECK: br %r14
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %1 = load double, ptr %arrayidx1, align 8
+ %add = fmul reassoc nsz arcp contract afn double %1, %0
+ ret double %add
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-sub-01.ll b/llvm/test/CodeGen/SystemZ/fp-sub-01.ll
index a6e01112619ee8..e875fa3be735b0 100644
--- a/llvm/test/CodeGen/SystemZ/fp-sub-01.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-sub-01.ll
@@ -119,3 +119,15 @@ define float @f7(ptr %ptr0) {
ret float %sub10
}
+
+; Check that reassociation flags do not get in the way of SEB.
+define float @f8(ptr %x) {
+; CHECK-LABEL: f8:
+; CHECK: seb %f0
+entry:
+ %0 = load float, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+ %1 = load float, ptr %arrayidx1, align 8
+ %add = fsub reassoc nsz arcp contract afn float %1, %0
+ ret float %add
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-sub-02.ll b/llvm/test/CodeGen/SystemZ/fp-sub-02.ll
index c564c2de31887d..3219b6e4be8f34 100644
--- a/llvm/test/CodeGen/SystemZ/fp-sub-02.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-sub-02.ll
@@ -119,3 +119,17 @@ define double @f7(ptr %ptr0) {
ret double %sub10
}
+
+; Check that reassociation flags do not get in the way of SDB.
+define double @f8(ptr %x) {
+; CHECK-LABEL: f8:
+; CHECK: ld %f0
+; CHECK: sdb %f0
+; CHECK: br %r14
+entry:
+ %0 = load double, ptr %x, align 8
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ %1 = load double, ptr %arrayidx1, align 8
+ %add = fsub reassoc nsz arcp contract afn double %1, %0
+ ret double %add
+}
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll
deleted file mode 100644
index 9511a5807d4c1d..00000000000000
--- a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll
+++ /dev/null
@@ -1,91 +0,0 @@
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
-; RUN: -print-before=machine-combiner -print-after=machine-combiner -ppc-fma \
-; RUN: 2>&1 | FileCheck %s
-; REQUIRES: asserts
-
-; Test reassociation involving fma using a ppc pattern.
-
-define double @fun0_fma2_add(ptr %x, double %A, double %B) {
-; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
-; CHECK-NEXT: # Machine code for function fun0_fma2_add: IsSSA, TracksLiveness
-; CHECK: bb.0.entry:
-; CHECK-NEXT: liveins: $r2d, $f0d, $f2d
-; CHECK-NEXT: [[Y:%2:fp64bit]] = COPY $f2d
-; CHECK-NEXT: [[X:%1:fp64bit]] = COPY $f0d
-; CHECK-NEXT: %0:addr64bit = COPY $r2d
-; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
-; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
-; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
-; CHECK-NEXT: %6:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFADB_CCPseudo [[X]], [[Y]]
-; CHECK-NEXT: %8:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21:%3:vr64bit]], killed [[M22:%4:vr64bit]], killed %7:vr64bit
-; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M31:%5:vr64bit]], killed [[M32:%6:vr64bit]], killed %8:vr64bit
-; CHECK-NEXT: $f0d = COPY %9:vr64bit
-; CHECK-NEXT: Return implicit $f0d
-
-; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
-; CHECK-NEXT: # Machine code for function fun0_fma2_add: IsSSA, TracksLiveness
-; CHECK: %10:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], [[X]]
-; CHECK-NEXT: %11:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M31]], killed [[M32]], [[Y]]
-; CHECK-NEXT: %9:vr64bit = {{.*}} WFADB_CCPseudo %10:vr64bit, %11:vr64bit
-; CHECK-NEXT: $f0d = COPY %9:vr64bit
-; CHECK-NEXT: Return implicit $f0d
-entry:
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
-
- %0 = load double, ptr %x
- %1 = load double, ptr %arrayidx1
- %2 = load double, ptr %arrayidx2
- %3 = load double, ptr %arrayidx4
-
- %mul1 = fmul reassoc nsz contract double %0, %1
- %mul2 = fmul reassoc nsz contract double %2, %3
-
- %A1 = fadd reassoc nsz contract double %A, %B
- %A2 = fadd reassoc nsz contract double %A1, %mul1
- %A3 = fadd reassoc nsz contract double %A2, %mul2
-
- ret double %A3
-}
-
-; Same as above, but with a long-latency factor in the root FMA which makes
-; this undesirable.
-define double @fun1_fma2_add_divop(ptr %x, double %A, double %B) {
-; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
-; CHECK-NEXT: # Machine code for function fun1_fma2_add_divop: IsSSA, TracksLiveness
-; CHECK: bb.0.entry:
-; CHECK-NEXT: liveins: $r2d, $f0d, $f2d
-; CHECK-NEXT: %2:fp64bit = COPY $f2d
-; CHECK-NEXT: %1:fp64bit = COPY $f0d
-; CHECK-NEXT: %0:addr64bit = COPY $r2d
-; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
-; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
-; CHECK-NEXT: %5:fp64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
-; CHECK-NEXT: %6:fp64bit = {{.*}} DDB %5:fp64bit(tied-def 0), %0:addr64bit, 24, $noreg
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFADB_CCPseudo %1:fp64bit, %2:fp64bit
-; CHECK-NEXT: %8:vr64bit = {{.*}} WFMADB_CCPseudo killed %3:vr64bit, killed %4:vr64bit, killed %7:vr64bit
-; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB_CCPseudo %5:fp64bit, killed %6:fp64bit, killed %8:vr64bit
-; CHECK-NEXT: $f0d = COPY %9:vr64bit
-; CHECK-NEXT: Return implicit $f0d
-entry:
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
-
- %0 = load double, ptr %x
- %1 = load double, ptr %arrayidx1
- %2 = load double, ptr %arrayidx2
- %3 = load double, ptr %arrayidx4
- %div = fdiv double %2, %3
-
- %mul1 = fmul reassoc nsz contract double %0, %1
- %mul2 = fmul reassoc nsz contract double %2, %div
-
- %A1 = fadd reassoc nsz contract double %A, %B
- %A2 = fadd reassoc nsz contract double %A1, %mul1
- %A3 = fadd reassoc nsz contract double %A2, %mul2
-
- ret double %A3
-}
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll
deleted file mode 100644
index 04a1298144aa12..00000000000000
--- a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll
+++ /dev/null
@@ -1,124 +0,0 @@
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
-; RUN: -print-before=machine-combiner -print-after=machine-combiner -z-fma \
-; RUN: 2>&1 | FileCheck %s
-; REQUIRES: asserts
-
-; Test reassociation involving fma.
-
-; The incoming accumulator is stalling so it is worth putting the
-; multiplications in parallell with it.
-define double @fun0_fma2_divop(ptr %x) {
-; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
-; CHECK-NEXT: # Machine code for function fun0_fma2_divop: IsSSA, TracksLiveness
-; CHECK: bb.0.entry:
-; CHECK-NEXT: liveins: $r2d
-; CHECK-NEXT: %0:addr64bit = COPY $r2d
-; CHECK-NEXT: [[M21:%1:vr64bit]] = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
-; CHECK-NEXT: [[M22:%2:vr64bit]] = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
-; CHECK-NEXT: [[M11:%3:vr64bit]] = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
-; CHECK-NEXT: [[M12:%4:vr64bit]] = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
-; CHECK-NEXT: [[DIV:%5:vr64bit]] = nofpexcept WFDDB %3:vr64bit, %4:vr64bit, implicit $fpc
-; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], killed [[DIV]]
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo [[M11]], [[M12]], killed %6:vr64bit
-; CHECK-NEXT: $f0d = COPY %7:vr64bit
-; CHECK-NEXT: Return implicit $f0d
-
-; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
-; CHECK-NEXT: # Machine code for function fun0_fma2_divop: IsSSA, TracksLiveness
-; CHECK: %8:vr64bit = {{.*}} WFMDB killed [[M21]], killed [[M22]]
-; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB_CCPseudo [[M11]], [[M12]], %8:vr64bit
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFADB_CCPseudo killed [[DIV]], %9:vr64bit
-entry:
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
-
- %0 = load double, ptr %x
- %1 = load double, ptr %arrayidx1
- %2 = load double, ptr %arrayidx2
- %3 = load double, ptr %arrayidx4
- %div = fdiv double %2, %3
-
- %mul1 = fmul reassoc nsz contract double %0, %1
- %mul2 = fmul reassoc nsz contract double %2, %3
-
- %A1 = fadd reassoc nsz contract double %div, %mul1
- %A2 = fadd reassoc nsz contract double %A1, %mul2
-
- ret double %A2
-}
-
-; The non-profitable case:
-define double @fun1_fma2(ptr %x, double %Arg) {
-; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
-; CHECK-NEXT: # Machine code for function fun1_fma2: IsSSA, TracksLiveness
-; CHECK: bb.0.entry:
-; CHECK-NEXT: liveins: $r2d, $f0d
-; CHECK-NEXT: %1:fp64bit = COPY $f0d
-; CHECK-NEXT: %0:addr64bit = COPY $r2d
-; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
-; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
-; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
-; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
-; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo killed %4:vr64bit, killed %5:vr64bit, killed %6:vr64bit
-; CHECK-NEXT: $f0d = COPY %7:vr64bit
-; CHECK-NEXT: Return implicit $f0d
-entry:
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
-
- %0 = load double, ptr %x
- %1 = load double, ptr %arrayidx1
- %2 = load double, ptr %arrayidx2
- %3 = load double, ptr %arrayidx4
-
- %mul1 = fmul reassoc nsz contract double %0, %1
- %mul2 = fmul reassoc nsz contract double %2, %3
-
- %A1 = fadd reassoc nsz contract double %Arg, %mul1
- %A2 = fadd reassoc nsz contract double %A1, %mul2
-
- ret double %A2
-}
-
-; Keep the two FMAs, but change order due to the long latency divide.
-define double @fun2_fma2(ptr %x) {
-; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
-; CHECK-NEXT: # Machine code for function fun2_fma2: IsSSA, TracksLiveness
-; CHECK: bb.0.entry:
-; CHECK-NEXT: liveins: $r2d
-; CHECK-NEXT: %0:addr64bit = COPY $r2d
-; CHECK-NEXT: %1:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
-; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
-; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
-; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
-; CHECK-NEXT: [[DIV:%5:vr64bit]] = nofpexcept WFDDB %3:vr64bit, %4:vr64bit, implicit $fpc
-; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB_CCPseudo killed %1:vr64bit, killed [[DIV]], killed %2:vr64bit
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo %3:vr64bit, %4:vr64bit, killed %6:vr64bit
-
-; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
-; CHECK-NEXT: # Machine code for function fun2_fma2: IsSSA, TracksLiveness
-; CHECK: %12:vr64bit = {{.*}} WFMADB_CCPseudo %3:vr64bit, %4:vr64bit, killed %2:vr64bit
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo killed %1:vr64bit, killed [[DIV]], %12:vr64bit
-
-entry:
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
-
- %0 = load double, ptr %x
- %1 = load double, ptr %arrayidx1
- %2 = load double, ptr %arrayidx2
- %3 = load double, ptr %arrayidx4
- %div = fdiv double %2, %3
-
- %mul1 = fmul reassoc nsz contract double %0, %div
- %mul2 = fmul reassoc nsz contract double %2, %3
-
- %A1 = fadd reassoc nsz contract double %1, %mul1
- %A2 = fadd reassoc nsz contract double %A1, %mul2
-
- ret double %A2
-}
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll
deleted file mode 100644
index b43ac9d4d528a4..00000000000000
--- a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll
+++ /dev/null
@@ -1,116 +0,0 @@
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
-; RUN: -print-before=machine-combiner -print-after=machine-combiner -z-fma \
-; RUN: 2>&1 | FileCheck %s
-; REQUIRES: asserts
-
-; Test reassociation involving fma.
-
-; No improvement possible.
-define double @fun0_fma1add(ptr %x) {
-; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
-; CHECK-NEXT: # Machine code for function fun0_fma1add: IsSSA, TracksLiveness
-; CHECK: bb.0.entry:
-; CHECK-NEXT: liveins: $r2d
-; CHECK-NEXT: %0:addr64bit = COPY $r2d
-; CHECK-NEXT: %1:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
-; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
-; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
-; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
-; CHECK-NEXT: %5:vr64bit = {{.*}} WFADB_CCPseudo killed %3:vr64bit, killed %4:vr64bit
-; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB_CCPseudo killed %1:vr64bit, killed %2:vr64bit, killed %5:vr64bit
-; CHECK-NEXT: $f0d = COPY %6:vr64bit
-; CHECK-NEXT: Return implicit $f0d
-entry:
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
-
- %0 = load double, ptr %x
- %1 = load double, ptr %arrayidx1
- %2 = load double, ptr %arrayidx2
- %3 = load double, ptr %arrayidx4
-
- %mul = fmul reassoc nsz contract double %0, %1
-
- %A1 = fadd reassoc nsz contract double %2, %3
- %A2 = fadd reassoc nsz contract double %A1, %mul
-
- ret double %A2
-}
-
-; The RHS of the Add is stalling, so move up the FMA to the LHS.
-define double @fun1_fma1add_divop(ptr %x) {
-; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
-; CHECK-NEXT: # Machine code for function fun1_fma1add_divop: IsSSA, TracksLiveness
-; CHECK: bb.0.entry:
-; CHECK-NEXT: liveins: $r2d
-; CHECK-NEXT: %0:addr64bit = COPY $r2d
-; CHECK-NEXT: [[M21:%1:vr64bit]] = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
-; CHECK-NEXT: [[M22:%2:vr64bit]] = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
-; CHECK-NEXT: [[T1:%3:fp64bit]] = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
-; CHECK-NEXT: [[DIV:%4:fp64bit]] = nofpexcept DDB %3:fp64bit(tied-def 0), %0:addr64bit, 24, $noreg
-; CHECK-NEXT: %5:vr64bit = {{.*}} WFADB_CCPseudo [[T1]], killed [[DIV]]
-; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], killed %5:vr64bit
-; CHECK-NEXT: $f0d = COPY %6:vr64bit
-; CHECK-NEXT: Return implicit $f0d
-
-; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
-; CHECK-NEXT: # Machine code for function fun1_fma1add_divop: IsSSA, TracksLiveness
-; CHECK: %7:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], [[T1]]
-; CHECK-NEXT: %6:vr64bit = {{.*}} WFADB_CCPseudo %7:vr64bit, killed [[DIV]]
-entry:
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
-
- %0 = load double, ptr %x
- %1 = load double, ptr %arrayidx1
- %2 = load double, ptr %arrayidx2
- %3 = load double, ptr %arrayidx4
- %div = fdiv double %2, %3
-
- %mul = fmul reassoc nsz contract double %0, %1
-
- %A1 = fadd reassoc nsz contract double %2, %div
- %A2 = fadd reassoc nsz contract double %A1, %mul
-
- ret double %A2
-}
-
-; The LHS of the Add is stalling, so move up the FMA to the RHS.
-define double @fun2_fma1add_divop(ptr %x) {
-; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
-; CHECK-NEXT: # Machine code for function fun2_fma1add_divop: IsSSA, TracksLiveness
-; CHECK: bb.0.entry:
-; CHECK-NEXT: liveins: $r2d
-; CHECK-NEXT: %0:addr64bit = COPY $r2d
-; CHECK-NEXT: [[M21:%1:vr64bit]] = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
-; CHECK-NEXT: [[M22:%2:vr64bit]] = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
-; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
-; CHECK-NEXT: [[T2:%4:vr64bit]] = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
-; CHECK-NEXT: [[DIV:%5:vr64bit]] = nofpexcept WFDDB killed %3:vr64bit, %4:vr64bit, implicit $fpc
-; CHECK-NEXT: %6:vr64bit = {{.*}} WFADB_CCPseudo killed [[DIV]], [[T2]]
-; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], killed %6:vr64bit
-
-; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
-; CHECK-NEXT: # Machine code for function fun2_fma1add_divop: IsSSA, TracksLiveness
-; CHECK: %9:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], [[T2]]
-; CHECK: %7:vr64bit = {{.*}} WFADB_CCPseudo %9:vr64bit, killed [[DIV]]
-entry:
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
-
- %0 = load double, ptr %x
- %1 = load double, ptr %arrayidx1
- %2 = load double, ptr %arrayidx2
- %3 = load double, ptr %arrayidx4
- %div = fdiv double %2, %3
-
- %mul = fmul reassoc nsz contract double %0, %1
-
- %A1 = fadd reassoc nsz contract double %div, %3
- %A2 = fadd reassoc nsz contract double %A1, %mul
-
- ret double %A2
-}
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll
deleted file mode 100644
index 8bacf8eec64965..00000000000000
--- a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll
+++ /dev/null
@@ -1,175 +0,0 @@
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -O3 -print-before=machine-combiner \
-; RUN: -print-after=machine-combiner -debug-only=machine-combiner,systemz-II -z-fma 2>&1 \
-; RUN: | FileCheck %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -O3 \
-; RUN: -print-after=machine-combiner -debug-only=machine-combiner,systemz-II -ppc-fma 2>&1 \
-; RUN: | FileCheck %s --check-prefix=ALT
-; REQUIRES: asserts
-
-; Test transformation of a sequence of 8 FMAs, with different patterns.
-
-define double @fun_fma8(ptr %x, double %A) {
-; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
-; CHECK-NEXT: # Machine code for function fun_fma8: IsSSA, TracksLiveness
-; CHECK: bb.0.entry:
-; CHECK-NEXT: liveins: $r2d, $f0d
-; CHECK-NEXT: %1:fp64bit = COPY $f0d
-; CHECK-NEXT: %0:addr64bit = COPY $r2d
-; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
-; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
-; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
-; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
-; CHECK-NEXT: %6:vr64bit = VL64 %0:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx6)
-; CHECK-NEXT: %7:vr64bit = VL64 %0:addr64bit, 40, $noreg :: (load (s64) from %ir.arrayidx8)
-; CHECK-NEXT: %8:vr64bit = VL64 %0:addr64bit, 48, $noreg :: (load (s64) from %ir.arrayidx10)
-; CHECK-NEXT: %9:vr64bit = VL64 %0:addr64bit, 56, $noreg :: (load (s64) from %ir.arrayidx12)
-; CHECK-NEXT: %10:vr64bit = VL64 %0:addr64bit, 64, $noreg :: (load (s64) from %ir.arrayidx14)
-; CHECK-NEXT: %11:vr64bit = VL64 %0:addr64bit, 72, $noreg :: (load (s64) from %ir.arrayidx16)
-; CHECK-NEXT: %12:vr64bit = VL64 %0:addr64bit, 80, $noreg :: (load (s64) from %ir.arrayidx18)
-; CHECK-NEXT: %13:vr64bit = VL64 %0:addr64bit, 88, $noreg :: (load (s64) from %ir.arrayidx20)
-; CHECK-NEXT: %14:vr64bit = VL64 %0:addr64bit, 96, $noreg :: (load (s64) from %ir.arrayidx22)
-; CHECK-NEXT: %15:vr64bit = VL64 %0:addr64bit, 104, $noreg :: (load (s64) from %ir.arrayidx24)
-; CHECK-NEXT: %16:vr64bit = VL64 %0:addr64bit, 112, $noreg :: (load (s64) from %ir.arrayidx26)
-; CHECK-NEXT: %17:vr64bit = VL64 %0:addr64bit, 120, $noreg :: (load (s64) from %ir.arrayidx28)
-; CHECK-NEXT: %18:vr64bit = {{.*}} WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
-; CHECK-NEXT: %19:vr64bit = {{.*}} WFMADB_CCPseudo killed %4:vr64bit, killed %5:vr64bit, killed %18:vr64bit
-; CHECK-NEXT: %20:vr64bit = {{.*}} WFMADB_CCPseudo killed %6:vr64bit, killed %7:vr64bit, killed %19:vr64bit
-; CHECK-NEXT: %21:vr64bit = {{.*}} WFMADB_CCPseudo killed %8:vr64bit, killed %9:vr64bit, killed %20:vr64bit
-; CHECK-NEXT: %22:vr64bit = {{.*}} WFMADB_CCPseudo killed %10:vr64bit, killed %11:vr64bit, killed %21:vr64bit
-; CHECK-NEXT: %23:vr64bit = {{.*}} WFMADB_CCPseudo killed %12:vr64bit, killed %13:vr64bit, killed %22:vr64bit
-; CHECK-NEXT: %24:vr64bit = {{.*}} WFMADB_CCPseudo killed %14:vr64bit, killed %15:vr64bit, killed %23:vr64bit
-; CHECK-NEXT: %25:vr64bit = {{.*}} WFMADB_CCPseudo killed %16:vr64bit, killed %17:vr64bit, killed %24:vr64bit
-; CHECK-NEXT: $f0d = COPY %25:vr64bit
-; CHECK-NEXT: Return implicit $f0d
-
-; CHECK: Machine InstCombiner: fun_fma8
-; CHECK: add pattern FMA2_P1P0
-; CHECK-NEXT: add pattern FMA2_P0P1
-; CHECK-NEXT: add pattern FMA2
-; CHECK: reassociating using pattern FMA_P1P0
-; CHECK: Dependence data for %21:vr64bit = {{.*}} WFMADB_CCPseudo
-; CHECK-NEXT: NewRootDepth: 16 RootDepth: 22 It MustReduceDepth and it does it
-; CHECK-NEXT: Resource length before replacement: 16 and after: 16
-; CHECK-NEXT: As result it IMPROVES/PRESERVES Resource Length
-; CHECK: add pattern FMA2_P1P0
-; CHECK-NEXT: add pattern FMA2_P0P1
-; CHECK-NEXT: add pattern FMA2
-; CHECK-NEXT: reassociating using pattern FMA_P1P0
-; CHECK-NEXT: Dependence data for %23:vr64bit = {{.*}} WFMADB_CCPseudo
-; CHECK-NEXT: NewRootDepth: 22 RootDepth: 28 It MustReduceDepth and it does it
-; CHECK: Resource length before replacement: 16 and after: 16
-; CHECK-NEXT: As result it IMPROVES/PRESERVES Resource Length
-; CHECK-NEXT: add pattern FMA1_Add_L
-; CHECK-NEXT: add pattern FMA1_Add_R
-; CHECK-NEXT: reassociating using pattern FMA1_Add_L
-; CHECK-NEXT: Dependence data for %24:vr64bit = {{.*}} WFMADB_CCPseudo
-; CHECK-NEXT: NewRootDepth: 28 RootDepth: 28 It MustReduceDepth but it does NOT do it
-; CHECK-NEXT: reassociating using pattern FMA1_Add_R
-; CHECK-NEXT: Dependence data for %24:vr64bit = {{.*}} WFMADB_CCPseudo
-; CHECK-NEXT: NewRootDepth: 22 RootDepth: 28 It MustReduceDepth and it does it
-; CHECK-NEXT: Resource length before replacement: 16 and after: 16
-; CHECK-NEXT: As result it IMPROVES/PRESERVES Resource Length
-
-; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
-; CHECK: %18:vr64bit = {{.*}} WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
-; CHECK-NEXT: %19:vr64bit = {{.*}} WFMADB_CCPseudo killed %4:vr64bit, killed %5:vr64bit, killed %18:vr64bit
-; CHECK-NEXT: %36:vr64bit = {{.*}} WFMDB killed %6:vr64bit, killed %7:vr64bit
-; CHECK-NEXT: %37:vr64bit = {{.*}} WFMADB_CCPseudo killed %8:vr64bit, killed %9:vr64bit, %36:vr64bit
-; CHECK-NEXT: %21:vr64bit = {{.*}} WFADB_CCPseudo killed %19:vr64bit, %37:vr64bit
-; CHECK-NEXT: %40:vr64bit = {{.*}} WFMDB killed %10:vr64bit, killed %11:vr64bit
-; CHECK-NEXT: %41:vr64bit = {{.*}} WFMADB_CCPseudo killed %12:vr64bit, killed %13:vr64bit, %40:vr64bit
-; CHECK-NEXT: %43:vr64bit = {{.*}} WFMADB_CCPseudo killed %14:vr64bit, killed %15:vr64bit, %41:vr64bit
-; CHECK-NEXT: %24:vr64bit = {{.*}} WFADB_CCPseudo %43:vr64bit, killed %21:vr64bit
-; CHECK-NEXT: %25:vr64bit = {{.*}} WFMADB_CCPseudo killed %16:vr64bit, killed %17:vr64bit, killed %24:vr64bit
-
-; ALT: Machine InstCombiner: fun_fma8
-; ALT-NEXT: Combining MBB entry
-; ALT-NEXT: add pattern FMA3
-; ALT-NEXT: reassociating using pattern FMA3
-; ALT-NEXT: Dependence data for %20:vr64bit = {{.*}} WFMADB_CCPseudo
-; ALT-NEXT: NewRootDepth: 16 RootDepth: 16 It MustReduceDepth but it does NOT do it
-; ALT-NEXT: add pattern FMA3
-; ALT-NEXT: reassociating using pattern FMA3
-; ALT-NEXT: Dependence data for %21:vr64bit = {{.*}} WFMADB_CCPseudo
-; ALT-NEXT: NewRootDepth: 16 RootDepth: 22 It MustReduceDepth and it does it
-; ALT-NEXT: Resource length before replacement: 16 and after: 16
-; ALT-NEXT: As result it IMPROVES/PRESERVES Resource Length
-; ALT-NEXT: add pattern FMA2_Add
-; ALT-NEXT: reassociating using pattern FMA2_Add
-; ALT-NEXT: Dependence data for %23:vr64bit = {{.*}} WFMADB_CCPseudo
-; ALT-NEXT: NewRootDepth: 22 RootDepth: 28 It MustReduceDepth and it does it
-; ALT-NEXT: Resource length before replacement: 16 and after: 16
-; ALT-NEXT: As result it IMPROVES/PRESERVES Resource Length
-; ALT-NEXT: add pattern FMA2_Add
-; ALT-NEXT: reassociating using pattern FMA2_Add
-; ALT-NEXT: Dependence data for %25:vr64bit = {{.*}} WFMADB_CCPseudo
-; ALT-NEXT: NewRootDepth: 28 RootDepth: 34 It MustReduceDepth and it does it
-; ALT-NEXT: Resource length before replacement: 16 and after: 16
-; ALT-NEXT: As result it IMPROVES/PRESERVES Resource Length
-
-; ALT: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
-; ALT: %18:vr64bit = {{.*}} WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
-; ALT-NEXT: %29:vr64bit = {{.*}} WFMDB killed %4:vr64bit, killed %5:vr64bit
-; ALT-NEXT: %30:vr64bit = {{.*}} WFMADB_CCPseudo killed %6:vr64bit, killed %7:vr64bit, killed %18:vr64bit
-; ALT-NEXT: %31:vr64bit = {{.*}} WFMADB_CCPseudo killed %8:vr64bit, killed %9:vr64bit, %29:vr64bit
-; ALT-NEXT: %32:vr64bit = {{.*}} WFMADB_CCPseudo killed %10:vr64bit, killed %11:vr64bit, %30:vr64bit
-; ALT-NEXT: %33:vr64bit = {{.*}} WFMADB_CCPseudo killed %12:vr64bit, killed %13:vr64bit, %31:vr64bit
-; ALT-NEXT: %34:vr64bit = {{.*}} WFMADB_CCPseudo killed %14:vr64bit, killed %15:vr64bit, %32:vr64bit
-; ALT-NEXT: %35:vr64bit = {{.*}} WFMADB_CCPseudo killed %16:vr64bit, killed %17:vr64bit, %33:vr64bit
-; ALT-NEXT: %25:vr64bit = {{.*}} WFADB_CCPseudo %34:vr64bit, %35:vr64bit
-
-entry:
- %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
- %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
- %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
- %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
- %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
- %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
- %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
- %arrayidx14 = getelementptr inbounds double, ptr %x, i64 8
- %arrayidx16 = getelementptr inbounds double, ptr %x, i64 9
- %arrayidx18 = getelementptr inbounds double, ptr %x, i64 10
- %arrayidx20 = getelementptr inbounds double, ptr %x, i64 11
- %arrayidx22 = getelementptr inbounds double, ptr %x, i64 12
- %arrayidx24 = getelementptr inbounds double, ptr %x, i64 13
- %arrayidx26 = getelementptr inbounds double, ptr %x, i64 14
- %arrayidx28 = getelementptr inbounds double, ptr %x, i64 15
-
- %0 = load double, ptr %x
- %1 = load double, ptr %arrayidx1
- %2 = load double, ptr %arrayidx2
- %3 = load double, ptr %arrayidx4
- %4 = load double, ptr %arrayidx6
- %5 = load double, ptr %arrayidx8
- %6 = load double, ptr %arrayidx10
- %7 = load double, ptr %arrayidx12
- %8 = load double, ptr %arrayidx14
- %9 = load double, ptr %arrayidx16
- %10 = load double, ptr %arrayidx18
- %11 = load double, ptr %arrayidx20
- %12 = load double, ptr %arrayidx22
- %13 = load double, ptr %arrayidx24
- %14 = load double, ptr %arrayidx26
- %15 = load double, ptr %arrayidx28
-
- %mul1 = fmul reassoc nsz contract double %0, %1
- %mul2 = fmul reassoc nsz contract double %2, %3
- %mul3 = fmul reassoc nsz contract double %4, %5
- %mul4 = fmul reassoc nsz contract double %6, %7
- %mul5 = fmul reassoc nsz contract double %8, %9
- %mul6 = fmul reassoc nsz contract double %10, %11
- %mul7 = fmul reassoc nsz contract double %12, %13
- %mul8 = fmul reassoc nsz contract double %14, %15
-
- %A1 = fadd reassoc nsz contract double %A, %mul1
- %A2 = fadd reassoc nsz contract double %A1, %mul2
- %A3 = fadd reassoc nsz contract double %A2, %mul3
- %A4 = fadd reassoc nsz contract double %A3, %mul4
- %A5 = fadd reassoc nsz contract double %A4, %mul5
- %A6 = fadd reassoc nsz contract double %A5, %mul6
- %A7 = fadd reassoc nsz contract double %A6, %mul7
- %A8 = fadd reassoc nsz contract double %A7, %mul8
-
- ret double %A8
-}
-
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-01.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp.ll
similarity index 100%
rename from llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-01.ll
rename to llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp.ll
>From 4df14d395223f03989da80af3c50967bc9e1dde8 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 19 Apr 2024 09:13:39 +0200
Subject: [PATCH 6/9] Try without CC pseudos
---
llvm/lib/Target/SystemZ/CMakeLists.txt | 1 -
llvm/lib/Target/SystemZ/SystemZ.h | 2 -
.../SystemZ/SystemZFinalizeReassociation.cpp | 96 ----------------
.../lib/Target/SystemZ/SystemZInstrFormats.td | 18 ---
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 82 +++++++-------
llvm/lib/Target/SystemZ/SystemZInstrInfo.h | 3 -
llvm/lib/Target/SystemZ/SystemZInstrVector.td | 32 +++---
llvm/lib/Target/SystemZ/SystemZOperators.td | 9 --
llvm/lib/Target/SystemZ/SystemZScheduleZ13.td | 2 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ14.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ15.td | 4 +-
llvm/lib/Target/SystemZ/SystemZScheduleZ16.td | 4 +-
.../Target/SystemZ/SystemZTargetMachine.cpp | 1 -
llvm/test/CodeGen/SystemZ/foldmem-peep.mir | 105 ++++++++++++++++++
14 files changed, 167 insertions(+), 196 deletions(-)
delete mode 100644 llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp
create mode 100644 llvm/test/CodeGen/SystemZ/foldmem-peep.mir
diff --git a/llvm/lib/Target/SystemZ/CMakeLists.txt b/llvm/lib/Target/SystemZ/CMakeLists.txt
index b8f07d1222722c..063e5bcd44171e 100644
--- a/llvm/lib/Target/SystemZ/CMakeLists.txt
+++ b/llvm/lib/Target/SystemZ/CMakeLists.txt
@@ -20,7 +20,6 @@ add_llvm_target(SystemZCodeGen
SystemZConstantPoolValue.cpp
SystemZCopyPhysRegs.cpp
SystemZElimCompare.cpp
- SystemZFinalizeReassociation.cpp
SystemZFrameLowering.cpp
SystemZHazardRecognizer.cpp
SystemZISelDAGToDAG.cpp
diff --git a/llvm/lib/Target/SystemZ/SystemZ.h b/llvm/lib/Target/SystemZ/SystemZ.h
index 49a200babfff57..d7aa9e4e18cbbb 100644
--- a/llvm/lib/Target/SystemZ/SystemZ.h
+++ b/llvm/lib/Target/SystemZ/SystemZ.h
@@ -195,14 +195,12 @@ FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZCopyPhysRegsPass(SystemZTargetMachine &TM);
-FunctionPass *createSystemZFinalizeReassociationPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZPostRewritePass(SystemZTargetMachine &TM);
FunctionPass *createSystemZTDCPass();
void initializeSystemZCopyPhysRegsPass(PassRegistry &);
void initializeSystemZDAGToDAGISelPass(PassRegistry &);
void initializeSystemZElimComparePass(PassRegistry &);
-void initializeSystemZFinalizeReassociationPass(PassRegistry &);
void initializeSystemZLDCleanupPass(PassRegistry &);
void initializeSystemZLongBranchPass(PassRegistry &);
void initializeSystemZPostRewritePass(PassRegistry &);
diff --git a/llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp b/llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp
deleted file mode 100644
index c98ef2df214b78..00000000000000
--- a/llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-//===----- SystemZFinalizeReassociation.cpp - Finalize reassociation ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass is the last step of the process of enabling reassociation with
-// the MachineCombiner. These are the steps involved:
-//
-// 1. Instruction selection: Disable reg/mem folding for any operations that
-// are reassociable since MachineCombiner will not succeed otherwise.
-// Select a reg/reg pseudo that pretends to clobber CC if the reg/mem
-// opcode clobbers it.
-//
-// 2. MachineCombiner: reassociation with the reg/reg instructions.
-//
-// 3. PeepholeOptimizer: Fold loads and reg/reg pseudos into reg/mem
-// instructions.
-//
-// 4. This pass: Convert any remaining reg/reg pseudos.
-//
-//===----------------------------------------------------------------------===//
-
-#include "SystemZTargetMachine.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/TargetInstrInfo.h"
-
-using namespace llvm;
-
-namespace {
-
-class SystemZFinalizeReassociation : public MachineFunctionPass {
-public:
- static char ID;
- SystemZFinalizeReassociation() : MachineFunctionPass(ID), TII(nullptr) {
- initializeSystemZFinalizeReassociationPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-private:
- bool visitMBB(MachineBasicBlock &MBB);
-
- const SystemZInstrInfo *TII;
-};
-
-char SystemZFinalizeReassociation::ID = 0;
-
-} // end anonymous namespace
-
-INITIALIZE_PASS(SystemZFinalizeReassociation, "systemz-finalize-reassoc",
- "SystemZ Finalize Reassociation", false, false)
-
-FunctionPass *
-llvm::createSystemZFinalizeReassociationPass(SystemZTargetMachine &TM) {
- return new SystemZFinalizeReassociation();
-}
-
-void SystemZFinalizeReassociation::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
-}
-
-bool SystemZFinalizeReassociation::visitMBB(MachineBasicBlock &MBB) {
- bool Changed = false;
- for (MachineInstr &MI : MBB) {
- unsigned PseudoOpcode = MI.getOpcode();
- unsigned TargetOpcode =
- PseudoOpcode == SystemZ::WFADB_CCPseudo ? SystemZ::WFADB
- : PseudoOpcode == SystemZ::WFASB_CCPseudo ? SystemZ::WFASB
- : PseudoOpcode == SystemZ::WFSDB_CCPseudo ? SystemZ::WFSDB
- : PseudoOpcode == SystemZ::WFSSB_CCPseudo ? SystemZ::WFSSB
- : 0;
- if (TargetOpcode) {
- MI.setDesc(TII->get(TargetOpcode));
- int CCIdx = MI.findRegisterDefOperandIdx(SystemZ::CC, /*isDead=*/true);
- assert(CCIdx != -1 && "Expected dead CC-def.");
- MI.removeOperand(CCIdx);
- Changed = true;
- }
- }
- return Changed;
-}
-
-bool SystemZFinalizeReassociation::runOnMachineFunction(MachineFunction &F) {
- TII = F.getSubtarget<SystemZSubtarget>().getInstrInfo();
-
- bool Modified = false;
- for (auto &MBB : F)
- Modified |= visitMBB(MBB);
-
- return Modified;
-}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index 62a7d93106bc68..3dba33b66bf4f4 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5536,21 +5536,3 @@ multiclass StringRRE<string mnemonic, bits<16> opcode,
[(set GR64:$end, (operator GR64:$start1, GR64:$start2,
GR32:$char))]>;
}
-
-// Duplicate the instruction with a pseudo that defines CC that will be
-// selected in cases where reassociation is enabled. The CC operand is needed
-// in order to do later reg/mem folding into instructions that clobber CC.
-multiclass BinaryVRRcAndCCPseudo<string mnemonic, bits<16> opcode,
- SDPatternOperator operator,
- SDPatternOperator reassoc_operator,
- TypedReg tr1, TypedReg tr2, bits<4> type = 0,
- bits<4> m5 = 0, bits<4> m6 = 0,
- string fp_mnemonic = ""> {
- def "" : BinaryVRRc<mnemonic, opcode, operator, tr1, tr2, type, m5, m6,
- fp_mnemonic>;
- let Defs = [CC], AddedComplexity = 1 in // Win over "".
- def _CCPseudo : Pseudo<(outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3),
- [(set (tr1.vt tr1.op:$V1),
- (reassoc_operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3)))]>;
-}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index dee1b365e029d9..949fcc148d2210 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -629,6 +629,7 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ MachineBasicBlock *MBB = MI.getParent();
// Check whether we can move the DefMI load, and that it only has one use.
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
@@ -641,21 +642,23 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
// For reassociable FP operations, any loads have been purposefully left
// unfolded so that MachineCombiner can do its work on reg/reg
// opcodes. After that, as many loads as possible are now folded.
+ // TODO: This may be beneficial with other opcodes as well, since
+ // machine-sink can move loads close to their user in a different MBB.
unsigned LoadOpc = 0;
unsigned RegMemOpcode = 0;
const TargetRegisterClass *FPRC = nullptr;
- RegMemOpcode = MI.getOpcode() == SystemZ::WFADB_CCPseudo ? SystemZ::ADB
- : MI.getOpcode() == SystemZ::WFSDB_CCPseudo ? SystemZ::SDB
- : MI.getOpcode() == SystemZ::WFMDB ? SystemZ::MDB
- : 0;
+ RegMemOpcode = MI.getOpcode() == SystemZ::WFADB ? SystemZ::ADB
+ : MI.getOpcode() == SystemZ::WFSDB ? SystemZ::SDB
+ : MI.getOpcode() == SystemZ::WFMDB ? SystemZ::MDB
+ : 0;
if (RegMemOpcode) {
LoadOpc = SystemZ::VL64;
FPRC = &SystemZ::FP64BitRegClass;
} else {
- RegMemOpcode = MI.getOpcode() == SystemZ::WFASB_CCPseudo ? SystemZ::AEB
- : MI.getOpcode() == SystemZ::WFSSB_CCPseudo ? SystemZ::SEB
- : MI.getOpcode() == SystemZ::WFMSB ? SystemZ::MEEB
- : 0;
+ RegMemOpcode = MI.getOpcode() == SystemZ::WFASB ? SystemZ::AEB
+ : MI.getOpcode() == SystemZ::WFSSB ? SystemZ::SEB
+ : MI.getOpcode() == SystemZ::WFMSB ? SystemZ::MEEB
+ : 0;
if (RegMemOpcode) {
LoadOpc = SystemZ::VL32;
FPRC = &SystemZ::FP32BitRegClass;
@@ -663,9 +666,24 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
}
if (!RegMemOpcode || DefMI->getOpcode() != LoadOpc)
return nullptr;
- assert((MI.findRegisterDefOperandIdx(SystemZ::CC) == -1 ||
- MI.findRegisterDefOperandIdx(SystemZ::CC, /*isDead=*/true) != -1) &&
- "Expected dead CC-def on add/sub pseudo instruction.");
+
+ // If RegMemOpcode clobbers CC, first make sure CC is not live at this point.
+ if (get(RegMemOpcode).hasImplicitDefOfPhysReg(SystemZ::CC)) {
+ assert(DefMI->getParent() == MI.getParent() && "Assuming a local fold.");
+ for (MachineBasicBlock::iterator MII = std::prev(MI.getIterator());;
+ --MII) {
+ if (MII->definesRegister(SystemZ::CC)) {
+ if (!MII->registerDefIsDead(SystemZ::CC))
+ return nullptr;
+ break;
+ }
+ if (MII == MBB->begin()) {
+ if (MBB->isLiveIn(SystemZ::CC))
+ return nullptr;
+ break;
+ }
+ }
+ }
Register DstReg = MI.getOperand(0).getReg();
MachineOperand LHS = MI.getOperand(1);
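
To make the comment above concrete, here is a source-level illustration (my
own example, not part of the patch): with reassociation enabled, ISel keeps
the additions as reg/reg WFADB so MachineCombiner can rebalance the
dependence chain, and only afterwards does this hook fold the remaining
loads into reg/mem ADB forms.

  // Illustration only (not from the patch): a chain of fadds over loaded
  // values.  Built with fast-math/reassoc flags, the adds stay reg/reg
  // until MachineCombiner has shortened the dependence chain; the peephole
  // pass can then fold loads such as p[3] directly into an ADB.
  double sum4(const double *p) {
    return ((p[0] + p[1]) + p[2]) + p[3];
  }
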
@@ -1073,25 +1091,6 @@ SystemZInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
return nullptr;
}
-void SystemZInstrInfo::finalizeInsInstrs(
- MachineInstr &Root, unsigned &P,
- SmallVectorImpl<MachineInstr *> &InsInstrs) const {
- const TargetRegisterInfo *TRI =
- Root.getParent()->getParent()->getSubtarget().getRegisterInfo();
- for (auto *Inst : InsInstrs) {
- switch (Inst->getOpcode()) {
- case SystemZ::WFADB_CCPseudo:
- case SystemZ::WFASB_CCPseudo:
- case SystemZ::WFSDB_CCPseudo:
- case SystemZ::WFSSB_CCPseudo:
- Inst->addRegisterDead(SystemZ::CC, TRI);
- break;
- default:
- break;
- }
- }
-}
-
bool SystemZInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
bool Invert) const {
unsigned Opc = Inst.getOpcode();
@@ -1106,11 +1105,8 @@ bool SystemZInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
default:
break;
// Adds and multiplications.
- case SystemZ::WFADB_CCPseudo:
- case SystemZ::WFASB_CCPseudo:
- assert(Inst.findRegisterDefOperandIdx(SystemZ::CC, /*isDead=*/true) != -1 &&
- "Expected dead CC-def on add/sub pseudo instruction.");
- LLVM_FALLTHROUGH;
+ case SystemZ::WFADB:
+ case SystemZ::WFASB:
case SystemZ::WFAXB:
case SystemZ::VFADB:
case SystemZ::VFASB:
@@ -1130,10 +1126,10 @@ std::optional<unsigned>
SystemZInstrInfo::getInverseOpcode(unsigned Opcode) const {
// fadd => fsub
switch (Opcode) {
- case SystemZ::WFADB_CCPseudo:
- return SystemZ::WFSDB_CCPseudo;
- case SystemZ::WFASB_CCPseudo:
- return SystemZ::WFSSB_CCPseudo;
+ case SystemZ::WFADB:
+ return SystemZ::WFSDB;
+ case SystemZ::WFASB:
+ return SystemZ::WFSSB;
case SystemZ::WFAXB:
return SystemZ::WFSXB;
case SystemZ::VFADB:
@@ -1141,10 +1137,10 @@ SystemZInstrInfo::getInverseOpcode(unsigned Opcode) const {
case SystemZ::VFASB:
return SystemZ::VFSSB;
// fsub => fadd
- case SystemZ::WFSDB_CCPseudo:
- return SystemZ::WFADB_CCPseudo;
- case SystemZ::WFSSB_CCPseudo:
- return SystemZ::WFASB_CCPseudo;
+ case SystemZ::WFSDB:
+ return SystemZ::WFADB;
+ case SystemZ::WFSSB:
+ return SystemZ::WFASB;
case SystemZ::WFSXB:
return SystemZ::WFAXB;
case SystemZ::VFSDB:
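
As a worked illustration of why the add/sub inverse mapping helps
reassociation (my reading of the generic MachineCombiner use of
getInverseOpcode, not SystemZ-specific logic): rewriting a serial
add-then-subtract chain with the inverse operation shortens the critical
path while producing the same value under reassociation rules.

  #include <cstdio>

  int main() {
    double a = 1.5, b = 2.25, c = 0.75;
    // Serial form: the subtract must wait for the add.
    double serial = (a + b) - c;
    // Reassociated form using the inverse operation: (a - c) and b are
    // independent, so the two operations can overlap.
    double reassoc = (a - c) + b;
    std::printf("%f %f\n", serial, reassoc);
    return 0;
  }
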
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index b0511407b6cd11..38f38e3e859876 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -292,9 +292,6 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
LiveIntervals *LIS) const override;
bool useMachineCombiner() const override { return true; }
- void
- finalizeInsInstrs(MachineInstr &Root, unsigned &P,
- SmallVectorImpl<MachineInstr *> &InsInstrs) const override;
bool isAssociativeAndCommutative(const MachineInstr &Inst,
bool Invert) const override;
std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 965da938fb2f6e..c29c54a6cb79de 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -1061,15 +1061,15 @@ multiclass VectorRounding<Instruction insn, TypedReg tr> {
let Predicates = [FeatureVector] in {
// Add.
let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
- def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
- def VFADB : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>;
- defm WFADB : BinaryVRRcAndCCPseudo<"wfadb", 0xE7E3, any_fadd,
- z_fadd_reassoc, v64db, v64db, 3, 8, 0, "adbr">;
+ def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
+ def VFADB : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>;
+ def WFADB : BinaryVRRc<"wfadb", 0xE7E3, any_fadd, v64db, v64db, 3, 8, 0,
+ "adbr">;
let Predicates = [FeatureVectorEnhancements1] in {
- def VFASB : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>;
- defm WFASB : BinaryVRRcAndCCPseudo<"wfasb", 0xE7E3, any_fadd,
- z_fadd_reassoc, v32sb, v32sb, 2, 8, 0, "aebr">;
- def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, any_fadd, v128xb, v128xb, 4, 8>;
+ def VFASB : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>;
+ def WFASB : BinaryVRRc<"wfasb", 0xE7E3, any_fadd, v32sb, v32sb, 2, 8, 0,
+ "aebr">;
+ def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, any_fadd, v128xb, v128xb, 4, 8>;
}
}
@@ -1389,15 +1389,15 @@ let Predicates = [FeatureVector] in {
// Subtract.
let Uses = [FPC], mayRaiseFPException = 1 in {
- def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
- def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, any_fsub, v128db, v128db, 3, 0>;
- defm WFSDB : BinaryVRRcAndCCPseudo<"wfsdb", 0xE7E2, any_fsub,
- z_fsub_reassoc, v64db, v64db, 3, 8, 0, "sdbr">;
+ def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
+ def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, any_fsub, v128db, v128db, 3, 0>;
+ def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, any_fsub, v64db, v64db, 3, 8, 0,
+ "sdbr">;
let Predicates = [FeatureVectorEnhancements1] in {
- def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, any_fsub, v128sb, v128sb, 2, 0>;
- defm WFSSB : BinaryVRRcAndCCPseudo<"wfssb", 0xE7E2, any_fsub,
- z_fsub_reassoc, v32sb, v32sb, 2, 8, 0, "sebr">;
- def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, any_fsub, v128xb, v128xb, 4, 8>;
+ def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, any_fsub, v128sb, v128sb, 2, 0>;
+ def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, any_fsub, v32sb, v32sb, 2, 8, 0,
+ "sebr">;
+ def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, any_fsub, v128xb, v128xb, 4, 8>;
}
}
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index e02f52426526a5..6cb89ccff85e68 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -841,15 +841,6 @@ def z_any_fmul_noreassoc : PatFrag<(ops node:$src1, node:$src2),
(any_fmul node:$src1, node:$src2),
[{ return !shouldSelectForReassoc(N); }]>;
-// Floating-point operations which are reassociable, and therefore should be
-// selected as reg/reg instructions (no memop folding).
-def z_fadd_reassoc : PatFrag<(ops node:$src1, node:$src2),
- (fadd node:$src1, node:$src2),
- [{ return shouldSelectForReassoc(N); }]>;
-def z_fsub_reassoc : PatFrag<(ops node:$src1, node:$src2),
- (fsub node:$src1, node:$src2),
- [{ return shouldSelectForReassoc(N); }]>;
-
// Strict floating-point fragments.
def z_any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
[(z_strict_fcmp node:$lhs, node:$rhs),
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index 5975d884144a1a..9ce1a0d06b5afd 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -1344,7 +1344,7 @@ def : InstRW<[WLat3, WLat3, VecXsPm, NormalGr], (instregex "(V|W)FTCIDB$")>;
// Add / subtract
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)$")>;
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
// Multiply / multiply-and-add/subtract
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM$")>;
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
index f18d304d7a8a37..120d4a457ee396 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -1388,9 +1388,9 @@ def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFTCIXB$")>;
// Add / subtract
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)$")>;
def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)SB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
// Multiply / multiply-and-add/subtract
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
index 52fa35c5038ccf..acba3a1fd9919e 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -1431,9 +1431,9 @@ def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFTCIXB$")>;
// Add / subtract
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
// Multiply / multiply-and-add/subtract
diff --git a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
index 975671d1a24436..dd82b2b9b71e75 100644
--- a/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
+++ b/llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
@@ -1437,9 +1437,9 @@ def : InstRW<[WLat3, WLat3, VecDFX, NormalGr], (instregex "WFTCIXB$")>;
// Add / subtract
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
// Multiply / multiply-and-add/subtract
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 3ffeb923930333..dced64d6b21ac7 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -259,7 +259,6 @@ bool SystemZPassConfig::addILPOpts() {
void SystemZPassConfig::addPreRegAlloc() {
addPass(createSystemZCopyPhysRegsPass(getSystemZTargetMachine()));
- addPass(createSystemZFinalizeReassociationPass(getSystemZTargetMachine()));
}
void SystemZPassConfig::addPostRewrite() {
diff --git a/llvm/test/CodeGen/SystemZ/foldmem-peep.mir b/llvm/test/CodeGen/SystemZ/foldmem-peep.mir
new file mode 100644
index 00000000000000..c6a244f4306161
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/foldmem-peep.mir
@@ -0,0 +1,105 @@
+# RUN: llc -mtriple=s390x-linux-gnu -mcpu=z16 -start-before=peephole-opt \
+# RUN: -stop-after=peephole-opt %s -o - | FileCheck %s
+
+--- |
+ define double @f1(ptr %x, i32 %a, i32 %b, i32 %limit, ptr %dst) #0 {
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ ret double 0.0
+ }
+ define double @f2(ptr %x, i32 %a, i32 %b, i32 %limit, ptr %dst) #0 {
+ %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+ ret double 0.0
+ }
+
+...
+
+# Do not fold where CC is live.
+# CHECK: name: f1
+# CHECK: {{.*}} WFADB
+---
+name: f1
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: addr64bit }
+ - { id: 1, class: gr32bit }
+ - { id: 2, class: gr32bit }
+ - { id: 3, class: gr32bit }
+ - { id: 4, class: addr64bit }
+ - { id: 5, class: vr64bit }
+ - { id: 6, class: vr64bit }
+ - { id: 7, class: vr64bit }
+ - { id: 8, class: grx32bit }
+liveins:
+ - { reg: '$r2d', virtual-reg: '%0' }
+ - { reg: '$r3l', virtual-reg: '%1' }
+ - { reg: '$r4l', virtual-reg: '%2' }
+ - { reg: '$r5l', virtual-reg: '%3' }
+ - { reg: '$r6d', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $r2d, $r3l, $r4l, $r5l, $r6d
+
+ %4:addr64bit = COPY $r6d
+ %3:gr32bit = COPY $r5l
+ %2:gr32bit = COPY $r4l
+ %1:gr32bit = COPY $r3l
+ %0:addr64bit = COPY $r2d
+ CLFIMux %3, 42, implicit-def $cc
+ %5:vr64bit = VL64 %0, 0, $noreg :: (load (s64) from %ir.x)
+ %6:vr64bit = VL64 %0, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+ %7:vr64bit = nsz arcp contract afn reassoc nofpexcept WFADB killed %6, killed %5, implicit $fpc
+ %8:grx32bit = SELRMux %2, %1, 14, 4, implicit $cc
+ STMux killed %8, %4, 0, $noreg :: (store (s32) into %ir.dst)
+ $f0d = COPY %7
+ Return implicit $f0d
+
+...
+
+# Do not fold where CC is live in.
+# CHECK: name: f2
+# CHECK: {{.*}} WFADB
+---
+name: f2
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: addr64bit }
+ - { id: 1, class: gr32bit }
+ - { id: 2, class: gr32bit }
+ - { id: 3, class: gr32bit }
+ - { id: 4, class: addr64bit }
+ - { id: 5, class: vr64bit }
+ - { id: 6, class: vr64bit }
+ - { id: 7, class: vr64bit }
+ - { id: 8, class: grx32bit }
+liveins:
+ - { reg: '$r2d', virtual-reg: '%0' }
+ - { reg: '$r3l', virtual-reg: '%1' }
+ - { reg: '$r4l', virtual-reg: '%2' }
+ - { reg: '$r5l', virtual-reg: '%3' }
+ - { reg: '$r6d', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0:
+ liveins: $r2d, $r3l, $r4l, $r5l, $r6d, $cc
+
+ %4:addr64bit = COPY $r6d
+ %3:gr32bit = COPY $r5l
+ %2:gr32bit = COPY $r4l
+ %1:gr32bit = COPY $r3l
+ %0:addr64bit = COPY $r2d
+ %5:vr64bit = VL64 %0, 0, $noreg :: (load (s64) from %ir.x)
+ %6:vr64bit = VL64 %0, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+ %7:vr64bit = nsz arcp contract afn reassoc nofpexcept WFADB killed %6, killed %5, implicit $fpc
+ %8:grx32bit = SELRMux %2, %1, 14, 4, implicit $cc
+ STMux killed %8, %4, 0, $noreg :: (store (s32) into %ir.dst)
+ $f0d = COPY %7
+ Return implicit $f0d
+
+...
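
A rough, standalone model of the CC check these two tests exercise
(simplified data structures, not the LLVM API): the fold into a
CC-clobbering reg/mem opcode is allowed only when the nearest preceding CC
definition is dead, and rejected whenever CC is otherwise live at the fold
point, including the live-in case shown in f2.

  #include <vector>

  // Simplified stand-in for a machine instruction: whether it defines CC
  // and, if so, whether that definition is dead.
  struct Instr {
    bool DefinesCC;
    bool CCDefIsDead;
  };

  // Scan backwards from the fold point.  Clobbering CC is safe only when
  // the nearest CC definition above is dead, or there is none and CC is
  // not live into the block.
  bool canClobberCC(const std::vector<Instr> &Block, size_t FoldIdx,
                    bool CCLiveIn) {
    for (size_t I = FoldIdx; I-- > 0;) {
      if (Block[I].DefinesCC)
        return Block[I].CCDefIsDead;
    }
    return !CCLiveIn;
  }
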
>From ab62cbd5d6c8a3fedb0a265579004ef29cc31b8f Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Fri, 19 Apr 2024 16:25:56 +0200
Subject: [PATCH 7/9] Try to use foldMemoryOperand from optimizeLoadInstr
---
llvm/include/llvm/CodeGen/TargetInstrInfo.h | 2 +-
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 157 ++++++++++---------
llvm/lib/Target/SystemZ/SystemZInstrInfo.h | 2 +-
llvm/lib/Target/X86/X86InstrInfo.cpp | 2 +-
llvm/lib/Target/X86/X86InstrInfo.h | 2 +-
5 files changed, 86 insertions(+), 79 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index d09dc3a9577d64..d5b1df2114e9e7 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1710,7 +1710,7 @@ class TargetInstrInfo : public MCInstrInfo {
/// instruction that defines FoldAsLoadDefReg, and the function returns
/// the machine instruction generated due to folding.
virtual MachineInstr *optimizeLoadInstr(MachineInstr &MI,
- MachineRegisterInfo *MRI,
+ const MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
return nullptr;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 949fcc148d2210..0ad4b577003f44 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -625,12 +625,9 @@ static void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
}
MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
- MachineRegisterInfo *MRI,
+ const MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
- const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
- MachineBasicBlock *MBB = MI.getParent();
-
// Check whether we can move the DefMI load, and that it only has one use.
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
assert(DefMI);
@@ -639,76 +636,9 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
!MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
return nullptr;
- // For reassociable FP operations, any loads have been purposefully left
- // unfolded so that MachineCombiner can do its work on reg/reg
- // opcodes. After that, as many loads as possible are now folded.
- // TODO: This may be beneficial with other opcodes as well, since
- // machine-sink can move loads close to their user in a different MBB.
- unsigned LoadOpc = 0;
- unsigned RegMemOpcode = 0;
- const TargetRegisterClass *FPRC = nullptr;
- RegMemOpcode = MI.getOpcode() == SystemZ::WFADB ? SystemZ::ADB
- : MI.getOpcode() == SystemZ::WFSDB ? SystemZ::SDB
- : MI.getOpcode() == SystemZ::WFMDB ? SystemZ::MDB
- : 0;
- if (RegMemOpcode) {
- LoadOpc = SystemZ::VL64;
- FPRC = &SystemZ::FP64BitRegClass;
- } else {
- RegMemOpcode = MI.getOpcode() == SystemZ::WFASB ? SystemZ::AEB
- : MI.getOpcode() == SystemZ::WFSSB ? SystemZ::SEB
- : MI.getOpcode() == SystemZ::WFMSB ? SystemZ::MEEB
- : 0;
- if (RegMemOpcode) {
- LoadOpc = SystemZ::VL32;
- FPRC = &SystemZ::FP32BitRegClass;
- }
- }
- if (!RegMemOpcode || DefMI->getOpcode() != LoadOpc)
- return nullptr;
-
- // If RegMemOpcode clobbers CC, first make sure CC is not live at this point.
- if (get(RegMemOpcode).hasImplicitDefOfPhysReg(SystemZ::CC)) {
- assert(DefMI->getParent() == MI.getParent() && "Assuming a local fold.");
- for (MachineBasicBlock::iterator MII = std::prev(MI.getIterator());;
- --MII) {
- if (MII->definesRegister(SystemZ::CC)) {
- if (!MII->registerDefIsDead(SystemZ::CC))
- return nullptr;
- break;
- }
- if (MII == MBB->begin()) {
- if (MBB->isLiveIn(SystemZ::CC))
- return nullptr;
- break;
- }
- }
- }
-
- Register DstReg = MI.getOperand(0).getReg();
- MachineOperand LHS = MI.getOperand(1);
- MachineOperand RHS = MI.getOperand(2);
- MachineOperand &RegMO = RHS.getReg() == FoldAsLoadDefReg ? LHS : RHS;
- if ((RegMemOpcode == SystemZ::SDB || RegMemOpcode == SystemZ::SEB) &&
- FoldAsLoadDefReg != RHS.getReg())
- return nullptr;
-
- MachineOperand &Base = DefMI->getOperand(1);
- MachineOperand &Disp = DefMI->getOperand(2);
- MachineOperand &Indx = DefMI->getOperand(3);
- MachineInstrBuilder MIB =
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(RegMemOpcode), DstReg)
- .add(RegMO)
- .add(Base)
- .add(Disp)
- .add(Indx)
- .addMemOperand(*DefMI->memoperands_begin());
- MIB->addRegisterDead(SystemZ::CC, TRI);
- MRI->setRegClass(DstReg, FPRC);
- MRI->setRegClass(RegMO.getReg(), FPRC);
- transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);
-
- return MIB;
+ int UseOpIdx = MI.findRegisterUseOperandIdx(FoldAsLoadDefReg);
+ assert(UseOpIdx != -1 && "Expected FoldAsLoadDefReg to be used by MI.");
+ return foldMemoryOperand(MI, {((unsigned) UseOpIdx)}, *DefMI);
}
bool SystemZInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
@@ -1486,7 +1416,84 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
LiveIntervals *LIS) const {
- return nullptr;
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ // For reassociable FP operations, any loads have been purposefully left
+ // unfolded so that MachineCombiner can do its work on reg/reg
+ // opcodes. After that, as many loads as possible are now folded.
+ // TODO: This may be beneficial with other opcodes as well, since
+ // machine-sink can move loads close to their user in a different MBB,
+ // which the isel matcher did not see.
+ unsigned LoadOpc = 0;
+ unsigned RegMemOpcode = 0;
+ const TargetRegisterClass *FPRC = nullptr;
+ RegMemOpcode = MI.getOpcode() == SystemZ::WFADB ? SystemZ::ADB
+ : MI.getOpcode() == SystemZ::WFSDB ? SystemZ::SDB
+ : MI.getOpcode() == SystemZ::WFMDB ? SystemZ::MDB
+ : 0;
+ if (RegMemOpcode) {
+ LoadOpc = SystemZ::VL64;
+ FPRC = &SystemZ::FP64BitRegClass;
+ } else {
+ RegMemOpcode = MI.getOpcode() == SystemZ::WFASB ? SystemZ::AEB
+ : MI.getOpcode() == SystemZ::WFSSB ? SystemZ::SEB
+ : MI.getOpcode() == SystemZ::WFMSB ? SystemZ::MEEB
+ : 0;
+ if (RegMemOpcode) {
+ LoadOpc = SystemZ::VL32;
+ FPRC = &SystemZ::FP32BitRegClass;
+ }
+ }
+ if (!RegMemOpcode || LoadMI.getOpcode() != LoadOpc)
+ return nullptr;
+
+ // If RegMemOpcode clobbers CC, first make sure CC is not live at this point.
+ if (get(RegMemOpcode).hasImplicitDefOfPhysReg(SystemZ::CC)) {
+ assert(LoadMI.getParent() == MI.getParent() && "Assuming a local fold.");
+ assert(LoadMI != InsertPt && "Assuming InsertPt not to be first in MBB.");
+ for (MachineBasicBlock::iterator MII = std::prev(InsertPt);;
+ --MII) {
+ if (MII->definesRegister(SystemZ::CC)) {
+ if (!MII->registerDefIsDead(SystemZ::CC))
+ return nullptr;
+ break;
+ }
+ if (MII == MBB->begin()) {
+ if (MBB->isLiveIn(SystemZ::CC))
+ return nullptr;
+ break;
+ }
+ }
+ }
+
+ Register FoldAsLoadDefReg = LoadMI.getOperand(0).getReg();
+ // We don't really need Ops, but do a sanity check:
+ assert(Ops.size() == 1 && FoldAsLoadDefReg == MI.getOperand(Ops[0]).getReg() &&
+ "Expected MI to be the only user of the load.");
+ Register DstReg = MI.getOperand(0).getReg();
+ MachineOperand LHS = MI.getOperand(1);
+ MachineOperand RHS = MI.getOperand(2);
+ MachineOperand &RegMO = RHS.getReg() == FoldAsLoadDefReg ? LHS : RHS;
+ if ((RegMemOpcode == SystemZ::SDB || RegMemOpcode == SystemZ::SEB) &&
+ FoldAsLoadDefReg != RHS.getReg())
+ return nullptr;
+
+ MachineOperand &Base = LoadMI.getOperand(1);
+ MachineOperand &Disp = LoadMI.getOperand(2);
+ MachineOperand &Indx = LoadMI.getOperand(3);
+ MachineInstrBuilder MIB =
+ BuildMI(*MI.getParent(), InsertPt, MI.getDebugLoc(), get(RegMemOpcode), DstReg)
+ .add(RegMO)
+ .add(Base)
+ .add(Disp)
+ .add(Indx);
+ MIB->addRegisterDead(SystemZ::CC, &RI);
+ MRI->setRegClass(DstReg, FPRC);
+ MRI->setRegClass(RegMO.getReg(), FPRC);
+ transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);
+
+ return MIB;
}
bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
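
One detail in the new foldMemoryOperandImpl code above deserves a small
source-level illustration (my example, not from the patch): subtraction is
not commutative, so only a load feeding the second operand can become the
memory operand of SDB/SEB, which is what the RHS check enforces.

  // Illustration only.  "sdb" computes reg - mem, so the load below can be
  // folded as the memory operand:
  double sub_reg_mem(double a, const double *p) { return a - *p; }

  // Here the load is the left operand; folding it would swap the operands
  // and compute the wrong value, so the fold is skipped for SDB/SEB.
  double sub_mem_reg(double a, const double *p) { return *p - a; }
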
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 38f38e3e859876..aa10fb56496231 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -255,7 +255,7 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
ArrayRef<MachineOperand> Cond, Register TrueReg,
Register FalseReg) const override;
MachineInstr *optimizeLoadInstr(MachineInstr &MI,
- MachineRegisterInfo *MRI,
+ const MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const override;
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 07b79572cfd698..3d80c43b571f9c 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5499,7 +5499,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
/// register, the virtual register is used once in the same BB, and the
/// instructions in-between do not load or store, and have no side effects.
MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
- MachineRegisterInfo *MRI,
+ const MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
// Check whether we can move DefMI here.
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index a6cae4622d922f..5407ede69a91ca 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -559,7 +559,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
const MachineRegisterInfo *MRI) const override;
MachineInstr *optimizeLoadInstr(MachineInstr &MI,
- MachineRegisterInfo *MRI,
+ const MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const override;
>From 1d13db68919a651c542a1812e99d0d058499c887 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Sat, 20 Apr 2024 09:53:03 +0200
Subject: [PATCH 8/9] Test updates + minor fixes.
---
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 29 ++++++++++----------
llvm/test/CodeGen/SystemZ/anyregcc.ll | 26 +++++++++---------
llvm/test/CodeGen/SystemZ/stackmap.ll | 4 +--
3 files changed, 29 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 0ad4b577003f44..c2f3688e521964 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -610,20 +610,6 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
.addImm(CCValid).addImm(CCMask);
}
-static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
- if (OldMI->registerDefIsDead(SystemZ::CC)) {
- MachineOperand *CCDef = NewMI->findRegisterDefOperand(SystemZ::CC);
- if (CCDef != nullptr)
- CCDef->setIsDead(true);
- }
-}
-
-static void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
- MachineInstr::MIFlag Flag) {
- if (OldMI->getFlag(Flag))
- NewMI->setFlag(Flag);
-}
-
MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
@@ -968,6 +954,19 @@ static LogicOp interpretAndImmediate(unsigned Opcode) {
}
}
+static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
+ if (OldMI->registerDefIsDead(SystemZ::CC)) {
+ MachineOperand *CCDef = NewMI->findRegisterDefOperand(SystemZ::CC);
+ if (CCDef != nullptr)
+ CCDef->setIsDead(true);
+ }
+}
+
+static void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
+ MachineInstr::MIFlag Flag) {
+ if (OldMI->getFlag(Flag))
+ NewMI->setFlag(Flag);
+}
MachineInstr *
SystemZInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
@@ -1470,7 +1469,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
Register FoldAsLoadDefReg = LoadMI.getOperand(0).getReg();
// We don't really need Ops, but do a sanity check:
assert(Ops.size() == 1 && FoldAsLoadDefReg == MI.getOperand(Ops[0]).getReg() &&
- "Expected MI to be the only user of the load.");
+ "Expected MI to have the only use of the load.");
Register DstReg = MI.getOperand(0).getReg();
MachineOperand LHS = MI.getOperand(1);
MachineOperand RHS = MI.getOperand(2);
diff --git a/llvm/test/CodeGen/SystemZ/anyregcc.ll b/llvm/test/CodeGen/SystemZ/anyregcc.ll
index 76b9352f30049e..8f477c929781cc 100644
--- a/llvm/test/CodeGen/SystemZ/anyregcc.ll
+++ b/llvm/test/CodeGen/SystemZ/anyregcc.ll
@@ -323,37 +323,37 @@ entry:
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
-; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .short 13
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
-; Loc 9: Register
-; CHECK-NEXT: .byte 1
+; Loc 9: IndirectMem
+; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .short 0
-; CHECK-NEXT: .long 0
-; Loc 10: Register
-; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .long 344
+; Loc 10: IndirectMem
+; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .short 0
-; CHECK-NEXT: .long 0
-; Loc 11: Register
-; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .long 352
+; Loc 11: IndirectMem
+; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .short 0
-; CHECK-NEXT: .long 0
-; Loc 12: Register
-; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .long 360
+; Loc 12: IndirectMem
+; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .short 0
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long 368
define i64 @anyreg_test2(ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5, ptr %a6, ptr %a7, ptr %a8, ptr %a9, ptr %a10, ptr %a11, ptr %a12) nounwind ssp uwtable {
entry:
%f = inttoptr i64 12297829382473034410 to ptr
diff --git a/llvm/test/CodeGen/SystemZ/stackmap.ll b/llvm/test/CodeGen/SystemZ/stackmap.ll
index 88c7336037c9c4..6156b7f2fc5a18 100644
--- a/llvm/test/CodeGen/SystemZ/stackmap.ll
+++ b/llvm/test/CodeGen/SystemZ/stackmap.ll
@@ -38,10 +38,10 @@
; CHECK-NEXT: .quad 160
; CHECK-NEXT: .quad 1
; CHECK-NEXT: .quad spilledValue
-; CHECK-NEXT: .quad 240
+; CHECK-NEXT: .quad 160
; CHECK-NEXT: .quad 1
; CHECK-NEXT: .quad spilledStackMapValue
-; CHECK-NEXT: .quad 200
+; CHECK-NEXT: .quad 160
; CHECK-NEXT: .quad 1
; CHECK-NEXT: .quad spillSubReg
; CHECK-NEXT: .quad 168
>From add84038dd93f7944aa8d4e9f30547d77912c3f6 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Tue, 30 Apr 2024 16:10:42 +0200
Subject: [PATCH 9/9] rebase + review updates
---
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 27 +++++++++++++-------
1 file changed, 18 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index c2f3688e521964..2b61cff727cdc7 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -622,9 +622,18 @@ MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
!MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
return nullptr;
- int UseOpIdx = MI.findRegisterUseOperandIdx(FoldAsLoadDefReg);
+ int UseOpIdx =
+ MI.findRegisterUseOperandIdx(FoldAsLoadDefReg, /*TRI=*/nullptr);
assert(UseOpIdx != -1 && "Expected FoldAsLoadDefReg to be used by MI.");
- return foldMemoryOperand(MI, {((unsigned) UseOpIdx)}, *DefMI);
+
+ // Check whether we can fold the load.
+ if (MachineInstr *FoldMI =
+ foldMemoryOperand(MI, {((unsigned)UseOpIdx)}, *DefMI)) {
+ FoldAsLoadDefReg = 0;
+ return FoldMI;
+ }
+
+ return nullptr;
}
bool SystemZInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
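
A minimal model of the control flow the hunk above settles on (plain C++
stand-ins, not the LLVM classes): find the operand of MI that reads the
load's result, attempt the generic fold, and clear the tracked register only
when the fold actually succeeded.

  #include <cstddef>
  #include <vector>

  // Stand-in instruction: just the registers it reads.
  struct Inst {
    std::vector<unsigned> UseRegs;
  };

  // Fold is a callback standing in for foldMemoryOperand; it may refuse
  // (e.g. when CC is live or the opcode has no reg/mem form).
  template <typename FoldFn>
  bool tryFoldLoad(const Inst &MI, unsigned &FoldAsLoadDefReg, FoldFn Fold) {
    for (std::size_t Idx = 0; Idx < MI.UseRegs.size(); ++Idx) {
      if (MI.UseRegs[Idx] != FoldAsLoadDefReg)
        continue;
      if (!Fold(Idx))
        return false;
      FoldAsLoadDefReg = 0; // Only reset the candidate once the fold worked.
      return true;
    }
    return false;
  }
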
@@ -955,8 +964,9 @@ static LogicOp interpretAndImmediate(unsigned Opcode) {
}
static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
- if (OldMI->registerDefIsDead(SystemZ::CC)) {
- MachineOperand *CCDef = NewMI->findRegisterDefOperand(SystemZ::CC);
+ if (OldMI->registerDefIsDead(SystemZ::CC, /*TRI=*/nullptr)) {
+ MachineOperand *CCDef =
+ NewMI->findRegisterDefOperand(SystemZ::CC, /*TRI=*/nullptr);
if (CCDef != nullptr)
CCDef->setIsDead(true);
}
@@ -1453,8 +1463,8 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
assert(LoadMI != InsertPt && "Assuming InsertPt not to be first in MBB.");
for (MachineBasicBlock::iterator MII = std::prev(InsertPt);;
--MII) {
- if (MII->definesRegister(SystemZ::CC)) {
- if (!MII->registerDefIsDead(SystemZ::CC))
+ if (MII->definesRegister(SystemZ::CC, /*TRI=*/nullptr)) {
+ if (!MII->registerDefIsDead(SystemZ::CC, /*TRI=*/nullptr))
return nullptr;
break;
}
@@ -1467,9 +1477,8 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
}
Register FoldAsLoadDefReg = LoadMI.getOperand(0).getReg();
- // We don't really need Ops, but do a sanity check:
- assert(Ops.size() == 1 && FoldAsLoadDefReg == MI.getOperand(Ops[0]).getReg() &&
- "Expected MI to have the only use of the load.");
+ if (Ops.size() != 1 || FoldAsLoadDefReg != MI.getOperand(Ops[0]).getReg())
+ return nullptr;
Register DstReg = MI.getOperand(0).getReg();
MachineOperand LHS = MI.getOperand(1);
MachineOperand RHS = MI.getOperand(2);