[llvm] [SystemZ] Add a SystemZ specific pre-RA scheduling strategy. (PR #135076)
Jonas Paulsson via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 24 14:00:34 PDT 2025
https://github.com/JonPsson1 updated https://github.com/llvm/llvm-project/pull/135076
>From 6c5d72cdcd1ef88d9d187c5b6fd92507f92dfba3 Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulsson at linux.vnet.ibm.com>
Date: Mon, 6 Feb 2023 12:22:48 +0100
Subject: [PATCH 1/2] REBASED Final cleanup. MISched0, with -misched-gprloads.
Simplified and refined. Remove previous versions. -tiny-region Disable pre-ra
scheduling. Revert "Disable pre-ra scheduling." Try TinyRegion=10 without
extra checks. Revert "Try TinyRegion=10 without extra checks." Try
TinyRegion=10 with some exclusions. Tiny10Lat without chainpreds and hazard
Was c0f4354
---
llvm/include/llvm/CodeGen/MachineScheduler.h | 2 +
llvm/lib/CodeGen/MachineScheduler.cpp | 51 +--
.../lib/Target/SystemZ/SystemZElimCompare.cpp | 42 +-
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 28 ++
llvm/lib/Target/SystemZ/SystemZInstrInfo.h | 11 +
.../SystemZ/SystemZMachineScheduler.cpp | 415 +++++++++++++++++-
.../Target/SystemZ/SystemZMachineScheduler.h | 75 +++-
.../Target/SystemZ/SystemZTargetMachine.cpp | 19 +
.../lib/Target/SystemZ/SystemZTargetMachine.h | 4 +
.../CodeGen/SystemZ/DAGCombiner_isAlias.ll | 8 +-
llvm/test/CodeGen/SystemZ/alias-01.ll | 4 +-
llvm/test/CodeGen/SystemZ/args-06.ll | 8 +-
llvm/test/CodeGen/SystemZ/args-12.ll | 2 +-
llvm/test/CodeGen/SystemZ/atomic-load-09.ll | 4 +-
llvm/test/CodeGen/SystemZ/atomic-store-08.ll | 8 +-
llvm/test/CodeGen/SystemZ/atomic-store-09.ll | 4 +-
.../test/CodeGen/SystemZ/atomicrmw-fmax-01.ll | 8 +-
.../test/CodeGen/SystemZ/atomicrmw-fmin-01.ll | 8 +-
.../CodeGen/SystemZ/atomicrmw-ops-i128.ll | 6 +-
llvm/test/CodeGen/SystemZ/bswap-09.ll | 10 +-
llvm/test/CodeGen/SystemZ/bswap-10.ll | 10 +-
llvm/test/CodeGen/SystemZ/call-zos-vec.ll | 18 +-
llvm/test/CodeGen/SystemZ/dag-combine-05.ll | 20 +-
...-asm-fp-int-casting-explicit-regs-zEC12.ll | 8 +-
.../inline-asm-fp-int-casting-zEC12.ll | 8 +-
llvm/test/CodeGen/SystemZ/int-conv-14.ll | 12 +-
llvm/test/CodeGen/SystemZ/int-mul-12.ll | 27 +-
llvm/test/CodeGen/SystemZ/int-mul-13.ll | 8 +-
llvm/test/CodeGen/SystemZ/int-mul-15.ll | 10 +-
llvm/test/CodeGen/SystemZ/int-uadd-14.ll | 38 +-
llvm/test/CodeGen/SystemZ/int-usub-13.ll | 50 +--
.../SystemZ/machine-combiner-reassoc-fp.ll | 192 ++++----
llvm/test/CodeGen/SystemZ/memcpy-01.ll | 4 +-
llvm/test/CodeGen/SystemZ/memset-01.ll | 4 +-
.../SystemZ/misched-prera-biaspregs.mir | 87 ++++
.../SystemZ/misched-prera-copy-coal.mir | 31 ++
.../SystemZ/misched-prera-latencies.mir | 167 +++++++
.../CodeGen/SystemZ/misched-prera-loads.mir | 391 +++++++++++++++++
.../SystemZ/misched-prera-manystores-01.ll | 31 ++
.../SystemZ/misched-prera-manystores-02.mir | 200 +++++++++
.../SystemZ/misched-prera-manystores-03.mir | 154 +++++++
.../SystemZ/misched-prera-tinyregions.mir | 160 +++++++
.../SystemZ/regcoal_remat_empty_subrange.ll | 6 +-
llvm/test/CodeGen/SystemZ/rot-03.ll | 22 +-
llvm/test/CodeGen/SystemZ/shift-13.ll | 48 +-
llvm/test/CodeGen/SystemZ/shift-14.ll | 48 +-
llvm/test/CodeGen/SystemZ/shift-15.ll | 48 +-
llvm/test/CodeGen/SystemZ/shift-16.ll | 86 ++--
llvm/test/CodeGen/SystemZ/shift-17.ll | 100 ++---
.../SystemZ/store_nonbytesized_vecs.ll | 80 ++--
llvm/test/CodeGen/SystemZ/vec-args-04.ll | 16 +-
.../SystemZ/vec-cmp-cmp-logic-select.ll | 302 ++++++-------
.../{vec-cmpsel.ll => vec-cmpsel-01.ll} | 199 ++++-----
llvm/test/CodeGen/SystemZ/vec-cmpsel-02.ll | 70 +++
llvm/test/CodeGen/SystemZ/vec-eval.ll | 14 +-
llvm/test/CodeGen/SystemZ/vec-move-23.ll | 250 +++++++----
llvm/test/CodeGen/SystemZ/vec-sub-01.ll | 20 +-
.../vector-constrained-fp-intrinsics.ll | 198 ++++-----
58 files changed, 2860 insertions(+), 994 deletions(-)
create mode 100644 llvm/test/CodeGen/SystemZ/misched-prera-biaspregs.mir
create mode 100644 llvm/test/CodeGen/SystemZ/misched-prera-copy-coal.mir
create mode 100644 llvm/test/CodeGen/SystemZ/misched-prera-latencies.mir
create mode 100644 llvm/test/CodeGen/SystemZ/misched-prera-loads.mir
create mode 100644 llvm/test/CodeGen/SystemZ/misched-prera-manystores-01.ll
create mode 100644 llvm/test/CodeGen/SystemZ/misched-prera-manystores-02.mir
create mode 100644 llvm/test/CodeGen/SystemZ/misched-prera-manystores-03.mir
create mode 100644 llvm/test/CodeGen/SystemZ/misched-prera-tinyregions.mir
rename llvm/test/CodeGen/SystemZ/{vec-cmpsel.ll => vec-cmpsel-01.ll} (78%)
create mode 100644 llvm/test/CodeGen/SystemZ/vec-cmpsel-02.ll
diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index bc00d0b4ff852..8a819051f069a 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1087,6 +1087,7 @@ class GenericSchedulerBase : public MachineSchedStrategy {
NoCand,
Only1,
PhysReg,
+ LivenessReduce,
RegExcess,
RegCritical,
Stall,
@@ -1218,6 +1219,7 @@ class GenericSchedulerBase : public MachineSchedStrategy {
};
// Utility functions used by heuristics in tryCandidate().
+unsigned computeRemLatency(SchedBoundary &CurrZone);
bool tryLess(int TryVal, int CandVal,
GenericSchedulerBase::SchedCandidate &TryCand,
GenericSchedulerBase::SchedCandidate &Cand,
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 0c3ffb1bbaa6f..63eea3e701bc2 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -3168,31 +3168,6 @@ initResourceDelta(const ScheduleDAGMI *DAG,
}
}
-/// Compute remaining latency. We need this both to determine whether the
-/// overall schedule has become latency-limited and whether the instructions
-/// outside this zone are resource or latency limited.
-///
-/// The "dependent" latency is updated incrementally during scheduling as the
-/// max height/depth of scheduled nodes minus the cycles since it was
-/// scheduled:
-/// DLat = max (N.depth - (CurrCycle - N.ReadyCycle) for N in Zone
-///
-/// The "independent" latency is the max ready queue depth:
-/// ILat = max N.depth for N in Available|Pending
-///
-/// RemainingLatency is the greater of independent and dependent latency.
-///
-/// These computations are expensive, especially in DAGs with many edges, so
-/// only do them if necessary.
-static unsigned computeRemLatency(SchedBoundary &CurrZone) {
- unsigned RemLatency = CurrZone.getDependentLatency();
- RemLatency = std::max(RemLatency,
- CurrZone.findMaxLatency(CurrZone.Available.elements()));
- RemLatency = std::max(RemLatency,
- CurrZone.findMaxLatency(CurrZone.Pending.elements()));
- return RemLatency;
-}
-
/// Returns true if the current cycle plus remaning latency is greater than
/// the critical path in the scheduling region.
bool GenericSchedulerBase::shouldReduceLatency(const CandPolicy &Policy,
@@ -3278,6 +3253,7 @@ const char *GenericSchedulerBase::getReasonStr(
case NoCand: return "NOCAND ";
case Only1: return "ONLY1 ";
case PhysReg: return "PHYS-REG ";
+ case LivenessReduce: return "LIVE-REDUC";
case RegExcess: return "REG-EXCESS";
case RegCritical: return "REG-CRIT ";
case Stall: return "STALL ";
@@ -3351,6 +3327,31 @@ void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
#endif
namespace llvm {
+/// Compute remaining latency. We need this both to determine whether the
+/// overall schedule has become latency-limited and whether the instructions
+/// outside this zone are resource or latency limited.
+///
+/// The "dependent" latency is updated incrementally during scheduling as the
+/// max height/depth of scheduled nodes minus the cycles since it was
+/// scheduled:
+///   DLat = max (N.depth - (CurrCycle - N.ReadyCycle)) for N in Zone
+///
+/// The "independent" latency is the max ready queue depth:
+/// ILat = max N.depth for N in Available|Pending
+///
+/// RemainingLatency is the greater of independent and dependent latency.
+///
+/// These computations are expensive, especially in DAGs with many edges, so
+/// only do them if necessary.
+unsigned computeRemLatency(SchedBoundary &CurrZone) {
+ unsigned RemLatency = CurrZone.getDependentLatency();
+ RemLatency = std::max(RemLatency,
+ CurrZone.findMaxLatency(CurrZone.Available.elements()));
+ RemLatency = std::max(RemLatency,
+ CurrZone.findMaxLatency(CurrZone.Pending.elements()));
+ return RemLatency;
+}
+
/// Return true if this heuristic determines order.
/// TODO: Consider refactor return type of these functions as integer or enum,
/// as we may need to differentiate whether TryCand is better than Cand.
diff --git a/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
index 81f0014dd83f2..149a7b2f451d5 100644
--- a/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -151,30 +151,6 @@ Reference SystemZElimCompare::getRegReferences(MachineInstr &MI, unsigned Reg) {
return Ref;
}
-// Return true if this is a load and test which can be optimized the
-// same way as compare instruction.
-static bool isLoadAndTestAsCmp(MachineInstr &MI) {
- // If we during isel used a load-and-test as a compare with 0, the
- // def operand is dead.
- return (MI.getOpcode() == SystemZ::LTEBR ||
- MI.getOpcode() == SystemZ::LTDBR ||
- MI.getOpcode() == SystemZ::LTXBR) &&
- MI.getOperand(0).isDead();
-}
-
-// Return the source register of Compare, which is the unknown value
-// being tested.
-static unsigned getCompareSourceReg(MachineInstr &Compare) {
- unsigned reg = 0;
- if (Compare.isCompare())
- reg = Compare.getOperand(0).getReg();
- else if (isLoadAndTestAsCmp(Compare))
- reg = Compare.getOperand(1).getReg();
- assert(reg);
-
- return reg;
-}
-
// Compare compares the result of MI against zero. If MI is an addition
// of -1 and if CCUsers is a single branch on nonzero, eliminate the addition
// and convert the branch to a BRCT(G) or BRCTH. Return true on success.
@@ -207,7 +183,7 @@ bool SystemZElimCompare::convertToBRCT(
// We already know that there are no references to the register between
// MI and Compare. Make sure that there are also no references between
// Compare and Branch.
- unsigned SrcReg = getCompareSourceReg(Compare);
+ unsigned SrcReg = TII->getCompareSourceReg(Compare);
MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
for (++MBBI; MBBI != MBBE; ++MBBI)
if (getRegReferences(*MBBI, SrcReg))
@@ -254,7 +230,7 @@ bool SystemZElimCompare::convertToLoadAndTrap(
// We already know that there are no references to the register between
// MI and Compare. Make sure that there are also no references between
// Compare and Branch.
- unsigned SrcReg = getCompareSourceReg(Compare);
+ unsigned SrcReg = TII->getCompareSourceReg(Compare);
MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
for (++MBBI; MBBI != MBBE; ++MBBI)
if (getRegReferences(*MBBI, SrcReg))
@@ -495,25 +471,17 @@ bool SystemZElimCompare::adjustCCMasksForInstr(
return true;
}
-// Return true if Compare is a comparison against zero.
-static bool isCompareZero(MachineInstr &Compare) {
- if (isLoadAndTestAsCmp(Compare))
- return true;
- return Compare.getNumExplicitOperands() == 2 &&
- Compare.getOperand(1).isImm() && Compare.getOperand(1).getImm() == 0;
-}
-
// Try to optimize cases where comparison instruction Compare is testing
// a value against zero. Return true on success and if Compare should be
// deleted as dead. CCUsers is the list of instructions that use the CC
// value produced by Compare.
bool SystemZElimCompare::optimizeCompareZero(
MachineInstr &Compare, SmallVectorImpl<MachineInstr *> &CCUsers) {
- if (!isCompareZero(Compare))
+ if (!TII->isCompareZero(Compare))
return false;
// Search back for CC results that are based on the first operand.
- unsigned SrcReg = getCompareSourceReg(Compare);
+ unsigned SrcReg = TII->getCompareSourceReg(Compare);
MachineBasicBlock &MBB = *Compare.getParent();
Reference CCRefs;
Reference SrcRefs;
@@ -702,7 +670,7 @@ bool SystemZElimCompare::processBlock(MachineBasicBlock &MBB) {
MachineBasicBlock::iterator MBBI = MBB.end();
while (MBBI != MBB.begin()) {
MachineInstr &MI = *--MBBI;
- if (CompleteCCUsers && (MI.isCompare() || isLoadAndTestAsCmp(MI)) &&
+ if (CompleteCCUsers && (MI.isCompare() || TII->isLoadAndTestAsCmp(MI)) &&
(optimizeCompareZero(MI, CCUsers) ||
fuseCompareOperations(MI, CCUsers))) {
++MBBI;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index ae6ca55a36092..0e10cdab714f0 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -2156,6 +2156,34 @@ unsigned SystemZInstrInfo::getFusedCompare(unsigned Opcode,
return 0;
}
+bool SystemZInstrInfo::isLoadAndTestAsCmp(const MachineInstr &MI) const {
+ // If we during isel used a load-and-test as a compare with 0, the
+ // def operand is dead.
+ return (MI.getOpcode() == SystemZ::LTEBR ||
+ MI.getOpcode() == SystemZ::LTDBR ||
+ MI.getOpcode() == SystemZ::LTXBR) &&
+ MI.getOperand(0).isDead();
+}
+
+bool SystemZInstrInfo::isCompareZero(const MachineInstr &Compare) const {
+ if (isLoadAndTestAsCmp(Compare))
+ return true;
+ return Compare.isCompare() && Compare.getNumExplicitOperands() == 2 &&
+ Compare.getOperand(1).isImm() && Compare.getOperand(1).getImm() == 0;
+}
+
+unsigned SystemZInstrInfo::
+getCompareSourceReg(const MachineInstr &Compare) const {
+ unsigned reg = 0;
+ if (Compare.isCompare())
+ reg = Compare.getOperand(0).getReg();
+ else if (isLoadAndTestAsCmp(Compare))
+ reg = Compare.getOperand(1).getReg();
+ assert(reg);
+
+ return reg;
+}
+
bool SystemZInstrInfo::
prepareCompareSwapOperands(MachineBasicBlock::iterator const MBBI) const {
assert(MBBI->isCompare() && MBBI->getOperand(0).isReg() &&
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 8b82af61e669a..2030d52becc0e 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -356,6 +356,17 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
SystemZII::FusedCompareType Type,
const MachineInstr *MI = nullptr) const;
+ // Return true if this is a load and test which can be optimized the
+ // same way as compare instruction.
+ bool isLoadAndTestAsCmp(const MachineInstr &MI) const;
+
+ // Return true if Compare is a comparison against zero.
+ bool isCompareZero(const MachineInstr &Compare) const;
+
+ // Return the source register of Compare, which is the unknown value
+ // being tested.
+ unsigned getCompareSourceReg(const MachineInstr &Compare) const;
+
// Try to find all CC users of the compare instruction (MBBI) and update
// all of them to maintain equivalent behavior after swapping the compare
// operands. Return false if not all users can be conclusively found and
diff --git a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
index 5e2365f1dc513..85376ec70edc5 100644
--- a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -5,22 +5,421 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// -------------------------- Post RA scheduling ---------------------------- //
-// SystemZPostRASchedStrategy is a scheduling strategy which is plugged into
-// the MachineScheduler. It has a sorted Available set of SUs and a pickNode()
-// implementation that looks to optimize decoder grouping and balance the
-// usage of processor resources. Scheduler states are saved for the end
-// region of each MBB, so that a successor block can learn from it.
-//===----------------------------------------------------------------------===//
#include "SystemZMachineScheduler.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
using namespace llvm;
#define DEBUG_TYPE "machine-scheduler"
+/// Pre-RA scheduling ///
+
+static cl::opt<unsigned> TinyRegionLim(
+ "tiny-region-lim", cl::Hidden, cl::init(10),
+ cl::desc("Run limited pre-ra scheduling on regions of this size or "
+ "smaller. Mainly for testing."));
+
+static bool isRegDef(const MachineOperand &MO) {
+ return MO.isReg() && MO.isDef();
+}
+
+static bool isVirtRegDef(const MachineOperand &MO) {
+ return isRegDef(MO) && MO.getReg().isVirtual();
+}
+
+static bool isPhysRegDef(const MachineOperand &MO) {
+ return isRegDef(MO) && MO.getReg().isPhysical();
+}
+
+static bool isVirtRegUse(const MachineOperand &MO) {
+ return MO.isReg() && MO.isUse() && MO.readsReg() && MO.getReg().isVirtual();
+}
+
+void SystemZPreRASchedStrategy::initializePrioRegClasses(
+ const TargetRegisterInfo *TRI) {
+ for (const TargetRegisterClass *RC : TRI->regclasses()) {
+ for (MVT VT : MVT::fp_valuetypes())
+ if (TRI->isTypeLegalForClass(*RC, VT)) {
+ PrioRegClasses.insert(RC->getID());
+ break;
+ }
+
+ // On SystemZ vector and FP registers overlap: add any vector RC.
+ if (!PrioRegClasses.count(RC->getID()))
+ for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
+ if (TRI->isTypeLegalForClass(*RC, VT)) {
+ PrioRegClasses.insert(RC->getID());
+ break;
+ }
+ }
+}
+
+void SystemZPreRASchedStrategy::VRegSet::dump(std::string Msg) {
+ dbgs() << Msg.c_str();
+ bool First = true;
+ for (auto R : *this) {
+ if (!First)
+ dbgs() << ", ";
+ else
+ First = false;
+ dbgs() << "%" << R.virtRegIndex();
+ }
+ dbgs() << "\n";
+}
+
+unsigned SystemZPreRASchedStrategy::getRemLat(SchedBoundary *Zone) const {
+ if (RemLat == ~0U)
+ RemLat = computeRemLatency(*Zone);
+ return RemLat;
+}
+
+void SystemZPreRASchedStrategy::initializeStoresGroup() {
+ StoresGroup.clear();
+ FirstStoreInGroupScheduled = false;
+
+ unsigned CurrMaxDepth = 0;
+ for (unsigned Idx = DAG->SUnits.size() - 1; Idx + 1 != 0; --Idx) {
+ const SUnit *SU = &DAG->SUnits[Idx];
+ const MachineInstr *MI = SU->getInstr();
+ if (!MI->getNumOperands() || MI->isCopy())
+ continue;
+
+ bool HasVirtDef = false;
+ bool HasVirtUse = false;
+ for (unsigned I = 0; I < MI->getDesc().getNumOperands(); ++I) {
+ const MachineOperand &MO = MI->getOperand(I);
+ if (isVirtRegDef(MO) && !MO.isDead())
+ HasVirtDef = true;
+ else if (isVirtRegUse(MO) &&
+ MI->getDesc().operands()[I].OperandType != MCOI::OPERAND_MEMORY)
+ HasVirtUse = true;
+ }
+ bool IsStore = !HasVirtDef && HasVirtUse;
+
+ // Find a group of stores that all are at the bottom while avoiding
+ // regions with any additional group of lesser depth.
+ if (SU->getDepth() > CurrMaxDepth) {
+ CurrMaxDepth = SU->getDepth();
+ bool PrevGroup = StoresGroup.size() > 1;
+ StoresGroup.clear();
+ if (PrevGroup)
+ return;
+ if (IsStore)
+ StoresGroup.insert(SU);
+ }
+ else if (IsStore && !StoresGroup.empty() && SU->getDepth() == CurrMaxDepth) {
+ // The group members should all have the same opcode.
+ if ((*StoresGroup.begin())->getInstr()->getOpcode() != MI->getOpcode()) {
+ StoresGroup.clear();
+ return;
+ }
+ StoresGroup.insert(SU);
+ }
+ }
+
+ // Value of 8 handles a known regression (with group of 20).
+ // TODO: Would some other value be better?
+ if (StoresGroup.size() < 8)
+ StoresGroup.clear();
+}
+
+static int biasPhysRegExtra(const SUnit *SU) {
+ if (int Res = biasPhysReg(SU, /*isTop=*/false))
+ return Res;
+
+ // Also recognize Load Address of stack slot. There are (at least
+ // currently) no instructions here defining a physreg that uses a vreg.
+ const MachineInstr *MI = SU->getInstr();
+ if (MI->getNumOperands() && !MI->isCopy()) {
+ const MachineOperand &DefMO = MI->getOperand(0);
+ if (isPhysRegDef(DefMO))
+ return 1;
+ }
+
+ return 0;
+}
+
+int SystemZPreRASchedStrategy::
+computeSULivenessScore(SchedCandidate &C, ScheduleDAGMILive *DAG,
+ SchedBoundary *Zone) const {
+ // Not all data deps are modelled around the SUnit - some data edges near
+ // boundaries are missing: Look directly at the MI operands instead.
+ const SUnit *SU = C.SU;
+ const MachineInstr *MI = SU->getInstr();
+ if (!MI->getNumOperands() || MI->isCopy())
+ return 0;
+
+ // Find uses of registers that are not already live (kills).
+ bool PrioKill = false;
+ bool GPRKill = false;
+ bool AddrKill = false;
+ bool HasPrioUse = false;
+ for (unsigned I = 0; I < MI->getDesc().getNumOperands(); ++I) {
+ const MachineOperand &MO = MI->getOperand(I);
+ if (!isVirtRegUse(MO))
+ continue;
+ HasPrioUse |= isPrioVirtReg(MO.getReg(), &DAG->MRI);
+ if (LiveRegs.count(MO.getReg()))
+ continue;
+ if (isPrioVirtReg(MO.getReg(), &DAG->MRI))
+ PrioKill = true;
+ else if (MI->getDesc().operands()[I].OperandType != MCOI::OPERAND_MEMORY)
+ GPRKill = true;
+ else
+ AddrKill = true;
+ }
+
+ // Find the interesting properties.
+ const MachineOperand &DefMO = MI->getOperand(0);
+ assert(!isPhysRegDef(DefMO) && "Did not expect physreg def!");
+ bool IsLoad =
+ isRegDef(DefMO) && !DefMO.isDead() && !IsRedefining[SU->NodeNum];
+ bool IsStore = (!isRegDef(DefMO) || DefMO.isDead());
+ // Prioritize FP: Ignore GPR/Addr kills with an FP def.
+ bool UsesLivePrio =
+ IsLoad && !PrioKill &&
+ (isPrioVirtReg(DefMO.getReg(), &DAG->MRI) || (!GPRKill && !AddrKill));
+ bool UsesLiveAll = !PrioKill && !GPRKill && !AddrKill;
+ bool PreservesSchedLat = SU->getHeight() <= Zone->getScheduledLatency();
+ const unsigned Cycles = 2;
+ unsigned Margin = SchedModel->getIssueWidth() * (Cycles + SU->Latency - 1);
+ bool HasDistToTop = NumLeft > Margin;
+
+ // Pull down a defining SU if it preserves the scheduled latency while not
+ // causing any (prioritized) register uses to become live. If however there
+ // will be relatively many SUs scheduled above this one and all uses are
+ // already live it should not be a problem to increase the scheduled
+ // latency given the OOO execution.
+  // TODO: Try scheduling small (DFSResult) subtrees as a unit.
+ bool SchedLow = IsLoad && ((PreservesSchedLat && UsesLivePrio) ||
+ (HasDistToTop && UsesLiveAll));
+
+ // This handles regions with many chained stores of the same depth at the
+ // bottom in the input order (cactus). Push them upwards during scheduling.
+ bool SchedHigh = IsStore && FirstStoreInGroupScheduled &&
+ StoresGroup.count(SU) &&
+ (PrioKill || (!HasPrioUse && GPRKill));
+
+ if (SchedLow)
+ return -1;
+ if (SchedHigh)
+ return 1;
+ return 0;
+}
+
+bool SystemZPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const {
+ assert(Zone && !Zone->isTop() && "Bottom-Up scheduling only.");
+
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = FirstValid;
+ return true;
+ }
+
+ // Bias physreg defs and copies to their uses and definitions respectively.
+ int TryCandPRegBias = biasPhysRegExtra(TryCand.SU);
+ int CandPRegBias = biasPhysRegExtra(Cand.SU);
+ if (tryGreater(TryCandPRegBias, CandPRegBias, TryCand, Cand, PhysReg))
+ return TryCand.Reason != NoCand;
+ if (TryCandPRegBias && CandPRegBias) {
+ // Both biased same way.
+ tryGreater(TryCand.SU->NodeNum, Cand.SU->NodeNum, TryCand, Cand, NodeOrder);
+ return TryCand.Reason != NoCand;
+ }
+
+ if (TinyRegion) {
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ // TODO: Try this in bigger regions as well.
+ if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+ Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return TryCand.Reason != NoCand;
+ } else {
+ // Look for an opportunity to reduce register liveness.
+ int TryCandScore = computeSULivenessScore(TryCand, DAG, Zone);
+ int CandScore = computeSULivenessScore(Cand, DAG, Zone);
+ if (tryLess(TryCandScore, CandScore, TryCand, Cand, LivenessReduce))
+ return TryCand.Reason != NoCand;
+
+ // Don't extend the scheduled latency.
+ if (ShouldReduceLatency && TryCand.SU->getHeight() != Cand.SU->getHeight() &&
+ (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) >
+ Zone->getScheduledLatency())) {
+ unsigned HigherSUDepth = TryCand.SU->getHeight() < Cand.SU->getHeight() ?
+ Cand.SU->getDepth() : TryCand.SU->getDepth();
+ if (HigherSUDepth != getRemLat(Zone) &&
+ tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+ TryCand, Cand, GenericSchedulerBase::BotHeightReduce)) {
+ return TryCand.Reason != NoCand;
+ }
+ }
+ }
+
+ // Weak edges are for clustering and other constraints.
+ if (tryLess(TryCand.SU->WeakSuccsLeft, Cand.SU->WeakSuccsLeft,
+ TryCand, Cand, Weak))
+ return TryCand.Reason != NoCand;
+
+ // Fall through to original instruction order.
+ if (TryCand.SU->NodeNum > Cand.SU->NodeNum) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ return false;
+}
+
+void SystemZPreRASchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) {
+ // Keep track of live regs instead of using the generic reg pressure tracking.
+ RegionPolicy.ShouldTrackPressure = false;
+  // These heuristics have so far seemed to work better without adding a
+ // top-down boundary.
+ RegionPolicy.OnlyBottomUp = true;
+}
+
+void SystemZPreRASchedStrategy::initialize(ScheduleDAGMI *dag) {
+ GenericScheduler::initialize(dag);
+
+ TinyRegion = DAG->SUnits.size() <= TinyRegionLim;
+ const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(DAG->TII);
+ if (TinyRegion) {
+ // A tiny region with long latency instructions is better handled using
+ // normal heuristics, except in regions that have COPYs of a physreg both
+ // ways and/or have a compare-0 likely to be eliminated.
+ const SUnit *CmpZeroSU = nullptr;
+ const SUnit *CmpSrcSU = nullptr;
+ Register CmpSrcReg = 0;
+ bool OtherCCClob = false;
+ unsigned MaxLat = 0;
+ std::set<Register> PRegs;
+ bool CopysPRegDep = false;
+ for (unsigned Idx = DAG->SUnits.size() - 1; Idx + 1 != 0; --Idx) {
+ const SUnit *SU = &DAG->SUnits[Idx];
+ const MachineInstr *MI = SU->getInstr();
+
+ // Check for a (likely) eliminable compare-0.
+ if (TII->isCompareZero(*MI)) {
+ CmpZeroSU = SU;
+ CmpSrcReg = TII->getCompareSourceReg(*MI);
+ continue;
+ }
+ if (MI->getNumOperands()) {
+ const MachineOperand &DefMO = MI->getOperand(0);
+ // Doing this instead of SU data preds happens to also handle the
+ // case where CmpSrcReg is redefined.
+ if (isVirtRegDef(DefMO) && DefMO.getReg() == CmpSrcReg &&
+ MI->getDesc().hasImplicitDefOfPhysReg(SystemZ::CC))
+ CmpSrcSU = SU;
+ }
+ if (SU != CmpZeroSU && SU != CmpSrcSU &&
+ MI->getDesc().hasImplicitDefOfPhysReg(SystemZ::CC))
+ OtherCCClob = true;
+
+ // Check for long latency instructions.
+ MaxLat = std::max(MaxLat, unsigned(SU->Latency));
+
+ // Check for COPYs of pregs both in and out of the region.
+ if (MI->isCopy()) {
+ Register DstReg = MI->getOperand(0).getReg();
+ Register SrcReg = MI->getOperand(1).getReg();
+ if (DstReg.isPhysical() && DAG->MRI.isAllocatable(DstReg) &&
+ SrcReg.isVirtual())
+ PRegs.insert(DstReg);
+ else if (SrcReg.isPhysical() && DAG->MRI.isAllocatable(SrcReg) &&
+ DstReg.isVirtual()) {
+ if (!PRegs.insert(SrcReg).second)
+ CopysPRegDep = true;
+ }
+ }
+ }
+ bool CmpElimRegion = CmpZeroSU && CmpSrcSU && OtherCCClob;
+
+ if (DAG->SUnits.size() > 6 && MaxLat >= 6 && !CopysPRegDep &&
+ !CmpElimRegion)
+ TinyRegion = false;
+ }
+ LLVM_DEBUG(dbgs() << "Region is" << (TinyRegion ? "" : " not") << " tiny.\n");
+ if (TinyRegion)
+ return;
+
+ NumLeft = DAG->SUnits.size();
+ RemLat = ~0U;
+
+ // It seems to work best to include the latencies in this heuristic (as
+ // opposed to something like a "unit SU height" with all latencies counted
+ // as 1).
+ unsigned DAGHeight = 0;
+ for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx)
+ DAGHeight = std::max(DAGHeight, DAG->SUnits[Idx].getHeight());
+ ShouldReduceLatency = DAG->SUnits.size() < 3 * std::max(DAGHeight, 1u);
+ LLVM_DEBUG(if (ShouldReduceLatency) dbgs() << "Latency scheduling enabled.\n";
+ else dbgs() << "Latency scheduling disabled.\n";);
+
+ // Find the registers that are live at the bottom, before scheduling.
+ LiveRegs.clear();
+ for (unsigned I = 0, E = DAG->MRI.getNumVirtRegs(); I != E; ++I) {
+ Register VirtReg = Register::index2VirtReg(I);
+ const LiveInterval &LI = DAG->getLIS()->getInterval(VirtReg);
+ LiveQueryResult LRQ = LI.Query(
+ DAG->getLIS()->getInstructionIndex(*DAG->SUnits.back().getInstr()));
+ if (LRQ.valueOut())
+ LiveRegs.insert(VirtReg);
+ }
+ LLVM_DEBUG(LiveRegs.dump("Live out at bottom: "););
+
+ // If MI uses the register it defines, record it one time here.
+ IsRedefining = std::vector<bool>(DAG->SUnits.size(), false);
+ for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+ const MachineInstr *MI = DAG->SUnits[Idx].getInstr();
+ if (MI->getNumOperands()) {
+ const MachineOperand &DefMO = MI->getOperand(0);
+ if (isVirtRegDef(DefMO))
+ IsRedefining[Idx] = MI->readsVirtualRegister(DefMO.getReg());
+ }
+ }
+
+ initializeStoresGroup();
+ LLVM_DEBUG(if (!StoresGroup.empty()) dbgs()
+ << "Has StoresGroup of " << StoresGroup.size() << " stores.\n";
+ else dbgs() << "No StoresGroup.\n";);
+}
+
+void SystemZPreRASchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+ GenericScheduler::schedNode(SU, IsTopNode);
+ if (TinyRegion)
+ return;
+
+ LLVM_DEBUG(LiveRegs.dump("Live regs was: "););
+
+ if (!FirstStoreInGroupScheduled && StoresGroup.count(SU))
+ FirstStoreInGroupScheduled = true;
+
+ // Update LiveRegs.
+ MachineInstr *MI = SU->getInstr();
+ for (auto &MO : MI->explicit_operands())
+ if (MO.isReg() && MO.getReg().isVirtual()) {
+ if (MO.isDef()) {
+ // A subreg def may not be in LiveRegs if the use of it was implicit.
+ assert(LiveRegs.count(MO.getReg()) || MO.isDead() || MO.getSubReg());
+ if (!IsRedefining[SU->NodeNum])
+ LiveRegs.erase(MO.getReg());
+ } else if (MO.readsReg())
+ LiveRegs.insert(MO.getReg());
+ }
+
+ assert(NumLeft > 0);
+ --NumLeft;
+ RemLat = ~0U;
+}
+
+/// Post-RA scheduling ///
+
#ifndef NDEBUG
// Print the set of SUs
void SystemZPostRASchedStrategy::SUSet::
diff --git a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
index 1ff6cc81503ce..6219e70839bc9 100644
--- a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
+++ b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -5,14 +5,24 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+
+// -------------------------- Pre RA scheduling ----------------------------- //
+//
+// SystemZPreRASchedStrategy keeps track of currently live registers and first
+// tries to reduce live ranges by scheduling e.g. a load of a live register
+// immediately (bottom-up). It also aims to preserve the scheduled latency.
+// Small regions (up to 10 instructions) are mostly left alone as the input
+// order is usually then preferred.
//
// -------------------------- Post RA scheduling ---------------------------- //
+//
// SystemZPostRASchedStrategy is a scheduling strategy which is plugged into
// the MachineScheduler. It has a sorted Available set of SUs and a pickNode()
// implementation that looks to optimize decoder grouping and balance the
// usage of processor resources. Scheduler states are saved for the end
// region of each MBB, so that a successor block can learn from it.
-//===----------------------------------------------------------------------===//
+//
+//----------------------------------------------------------------------------//
#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
@@ -24,6 +34,69 @@
namespace llvm {
+/// A MachineSchedStrategy implementation for SystemZ pre RA scheduling.
+class SystemZPreRASchedStrategy : public GenericScheduler {
+ // The FP/Vector registers are prioritized during scheduling.
+ std::set<unsigned> PrioRegClasses;
+ void initializePrioRegClasses(const TargetRegisterInfo *TRI);
+ bool isPrioVirtReg(Register Reg, const MachineRegisterInfo *MRI) const {
+ return (Reg.isVirtual() &&
+ PrioRegClasses.count(MRI->getRegClass(Reg)->getID()));
+ }
+
+ // A TinyRegion has up to 10 instructions and is scheduled differently.
+ bool TinyRegion;
+
+ // Num instructions left to schedule.
+ unsigned NumLeft;
+
+ // Tru if latency scheduling is enabled.
+ bool ShouldReduceLatency;
+
+ // Keep track of currently live registers.
+ struct VRegSet : std::set<Register> {
+ void dump(std::string Msg);
+ size_type count(Register Reg) const {
+ assert(Reg.isVirtual());
+ return std::set<Register>::count(Reg);
+ }
+ } LiveRegs;
+
+ // True if MI is also using the register it defines.
+ std::vector<bool> IsRedefining;
+
+ // Only call computeRemLatency() once per scheduled node.
+ mutable unsigned RemLat;
+ unsigned getRemLat(SchedBoundary *Zone) const;
+
+ // A large group of stores at the bottom is spread upwards.
+ std::set<const SUnit*> StoresGroup;
+ bool FirstStoreInGroupScheduled;
+ void initializeStoresGroup();
+
+ // Compute the effect on register liveness by scheduling C next. An
+ // instruction that defines a live register without causing any other
+ // register to become live reduces liveness, while a store of a non-live
+ // register would increase it.
+ int computeSULivenessScore(SchedCandidate &C, ScheduleDAGMILive *DAG,
+ SchedBoundary *Zone) const;
+
+protected:
+ bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const override;
+
+public:
+ SystemZPreRASchedStrategy(const MachineSchedContext *C) : GenericScheduler(C) {
+ initializePrioRegClasses(C->MF->getRegInfo().getTargetRegisterInfo());
+ }
+
+ void initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) override;
+ void initialize(ScheduleDAGMI *dag) override;
+ void schedNode(SUnit *SU, bool IsTopNode) override;
+};
+
/// A MachineSchedStrategy implementation for SystemZ post RA scheduling.
class SystemZPostRASchedStrategy : public MachineSchedStrategy {
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 6300fb72990d0..70460c6197f29 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -35,6 +35,11 @@ static cl::opt<bool> EnableMachineCombinerPass(
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool> GenericSched(
+ "generic-sched", cl::Hidden, cl::init(false),
+ cl::desc("Run the generic pre-ra scheduler instead of the SystemZ "
+ "scheduler."));
+
// NOLINTNEXTLINE(readability-identifier-naming)
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
// Register the target.
@@ -206,6 +211,20 @@ SystemZTargetMachine::getSubtargetImpl(const Function &F) const {
return I.get();
}
+ScheduleDAGInstrs *
+SystemZTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
+ // Use GenericScheduler if requested on CL or for Z10, which has no sched
+ // model.
+ if (GenericSched ||
+ !C->MF->getSubtarget().getSchedModel().hasInstrSchedModel())
+ return nullptr;
+
+ ScheduleDAGMILive *DAG =
+ new ScheduleDAGMILive(C, std::make_unique<SystemZPreRASchedStrategy>(C));
+ DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
+ return DAG;
+}
+
ScheduleDAGInstrs *
SystemZTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
return new ScheduleDAGMI(C, std::make_unique<SystemZPostRASchedStrategy>(C),
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.h b/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
index cced57a40ede0..1493332b9d361 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -55,6 +55,10 @@ class SystemZTargetMachine : public CodeGenTargetMachineImpl {
MachineFunctionInfo *
createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
const TargetSubtargetInfo *STI) const override;
+
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override;
+
ScheduleDAGInstrs *
createPostMachineScheduler(MachineSchedContext *C) const override;
diff --git a/llvm/test/CodeGen/SystemZ/DAGCombiner_isAlias.ll b/llvm/test/CodeGen/SystemZ/DAGCombiner_isAlias.ll
index 0e6c0e5836c04..2556c62ed1e72 100644
--- a/llvm/test/CodeGen/SystemZ/DAGCombiner_isAlias.ll
+++ b/llvm/test/CodeGen/SystemZ/DAGCombiner_isAlias.ll
@@ -10,10 +10,10 @@
; %.b = load i1, ptr @g_2, align 4
; CHECK: # %bb.6: # %crc32_gentab.exit
-; CHECK: larl %r2, g_2
-; CHECK-NEXT: llc %r3, 0(%r2)
-; CHECK-NOT: %r2
-; CHECK: llc %r1, 0(%r2)
+; CHECK: larl [[REG:%r[1-9]+]], g_2
+; CHECK-NEXT: llc {{%r[1-9]}}, 0([[REG]])
+; CHECK-NOT: [[REG]],
+; CHECK: llc %r1, 0([[REG]])
@g_2 = external hidden unnamed_addr global i1, align 4
@.str.1 = external hidden unnamed_addr constant [4 x i8], align 2
diff --git a/llvm/test/CodeGen/SystemZ/alias-01.ll b/llvm/test/CodeGen/SystemZ/alias-01.ll
index 008d659219172..83f721c0461ae 100644
--- a/llvm/test/CodeGen/SystemZ/alias-01.ll
+++ b/llvm/test/CodeGen/SystemZ/alias-01.ll
@@ -1,6 +1,6 @@
-; Test 32-bit ANDs in which the second operand is variable.
+; Test 32-bit ADDs in which the second operand is variable.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
; Check that there are no spills.
define void @f1(ptr %src1, ptr %dest) {
diff --git a/llvm/test/CodeGen/SystemZ/args-06.ll b/llvm/test/CodeGen/SystemZ/args-06.ll
index d19fdb58e5a16..45723ba9ab2e5 100644
--- a/llvm/test/CodeGen/SystemZ/args-06.ll
+++ b/llvm/test/CodeGen/SystemZ/args-06.ll
@@ -5,10 +5,10 @@
define i8 @f1(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g) {
; CHECK-LABEL: f1:
-; CHECK: lb {{%r[0-5]}}, 175(%r15)
-; CHECK: lb {{%r[0-5]}}, 167(%r15)
-; CHECK: ar %r2, %r3
-; CHECK: ar %r2, %r4
+; CHECK-DAG: lb {{%r[0-5]}}, 175(%r15)
+; CHECK-DAG: lb {{%r[0-5]}}, 167(%r15)
+; CHECK-DAG: ar %r2, %r3
+; CHECK-DAG: ar %r2, %r4
; CHECK: ar %r2, %r5
; CHECK: ar %r2, %r6
; CHECK: br %r14
diff --git a/llvm/test/CodeGen/SystemZ/args-12.ll b/llvm/test/CodeGen/SystemZ/args-12.ll
index 472672bbfd5ca..ada3fd3085a1c 100644
--- a/llvm/test/CodeGen/SystemZ/args-12.ll
+++ b/llvm/test/CodeGen/SystemZ/args-12.ll
@@ -28,11 +28,11 @@ define void @foo() {
; CHECK-NEXT: vl %v0, 0(%r1), 3
; CHECK-NEXT: vst %v0, 160(%r15), 3
; CHECK-NEXT: vgbm %v0, 0
-; CHECK-NEXT: la %r6, 216(%r15)
; CHECK-NEXT: lghi %r2, 1
; CHECK-NEXT: lghi %r3, 2
; CHECK-NEXT: lghi %r4, 3
; CHECK-NEXT: lghi %r5, 4
+; CHECK-NEXT: la %r6, 216(%r15)
; CHECK-NEXT: vst %v0, 200(%r15), 3
; CHECK-NEXT: vst %v0, 216(%r15), 3
; CHECK-NEXT: brasl %r14, bar@PLT
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-09.ll b/llvm/test/CodeGen/SystemZ/atomic-load-09.ll
index 61b8e2f0efa8c..74e5716b4169e 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-09.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-09.ll
@@ -39,8 +39,8 @@ define void @f2(ptr %ret, ptr %src) {
; CHECK-NEXT: aghi %r15, -176
; CHECK-NEXT: .cfi_def_cfa_offset 336
; CHECK-NEXT: lgr %r13, %r2
-; CHECK-NEXT: la %r4, 160(%r15)
; CHECK-NEXT: lghi %r2, 16
+; CHECK-NEXT: la %r4, 160(%r15)
; CHECK-NEXT: lhi %r5, 5
; CHECK-NEXT: brasl %r14, __atomic_load@PLT
; CHECK-NEXT: vl %v0, 160(%r15), 3
@@ -62,8 +62,8 @@ define void @f2_fpuse(ptr %ret, ptr %src) {
; CHECK-NEXT: aghi %r15, -176
; CHECK-NEXT: .cfi_def_cfa_offset 336
; CHECK-NEXT: lgr %r13, %r2
-; CHECK-NEXT: la %r4, 160(%r15)
; CHECK-NEXT: lghi %r2, 16
+; CHECK-NEXT: la %r4, 160(%r15)
; CHECK-NEXT: lhi %r5, 5
; CHECK-NEXT: brasl %r14, __atomic_load@PLT
; CHECK-NEXT: vl %v0, 160(%r15), 3
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
index 57f1319365c4f..d73ab946f72c7 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
@@ -86,11 +86,11 @@ define void @f2_fpuse(ptr %dst, ptr %src) {
; CHECK-NEXT: .cfi_def_cfa_offset 336
; CHECK-NEXT: ld %f0, 0(%r3)
; CHECK-NEXT: ld %f2, 8(%r3)
-; CHECK-DAG: lgr %r3, %r2
+; CHECK-DAG: lgr %r3, %r2
; CHECK-DAG: axbr %f0, %f0
-; CHECK-NEXT: la %r4, 160(%r15)
-; CHECK-NEXT: lghi %r2, 16
-; CHECK-NEXT: lhi %r5, 5
+; CHECK-DAG: la %r4, 160(%r15)
+; CHECK-DAG: lghi %r2, 16
+; CHECK-DAG: lhi %r5, 5
; CHECK-NEXT: std %f0, 160(%r15)
; CHECK-NEXT: std %f2, 168(%r15)
; CHECK-NEXT: brasl %r14, __atomic_store@PLT
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-09.ll b/llvm/test/CodeGen/SystemZ/atomic-store-09.ll
index 3af16490b34bd..b85e7f81790ec 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-09.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-09.ll
@@ -42,9 +42,9 @@ define void @f2(ptr %dst, ptr %src) {
; CHECK-NEXT: .cfi_def_cfa_offset 336
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: lgr %r0, %r2
-; CHECK-NEXT: la %r4, 160(%r15)
; CHECK-NEXT: lghi %r2, 16
; CHECK-NEXT: lgr %r3, %r0
+; CHECK-NEXT: la %r4, 160(%r15)
; CHECK-NEXT: lhi %r5, 5
; CHECK-NEXT: vst %v0, 160(%r15), 3
; CHECK-NEXT: brasl %r14, __atomic_store@PLT
@@ -66,9 +66,9 @@ define void @f2_fpuse(ptr %dst, ptr %src) {
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: wfaxb %v0, %v0, %v0
; CHECK-NEXT: lgr %r0, %r2
-; CHECK-NEXT: la %r4, 160(%r15)
; CHECK-NEXT: lghi %r2, 16
; CHECK-NEXT: lgr %r3, %r0
+; CHECK-NEXT: la %r4, 160(%r15)
; CHECK-NEXT: lhi %r5, 5
; CHECK-NEXT: vst %v0, 160(%r15), 3
; CHECK-NEXT: brasl %r14, __atomic_store@PLT
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-01.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-01.ll
index 80c43137e3a03..04b8e9f0d2c55 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-01.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-01.ll
@@ -12,10 +12,10 @@ define float @f1(ptr %src, float %b) {
; CHECK: ler %f0, [[FSRC]]
; CHECK: ler %f2, [[FB]]
; CHECK: brasl %r14, fmaxf@PLT
-; CHECK: lgdr [[RO:%r[0-9]+]], %f0
-; CHECK: srlg [[RO]], [[RO]], 32
-; CHECK: lgdr [[RI:%r[0-9]+]], [[FSRC]]
-; CHECK: srlg [[RI]], [[RI]], 32
+; CHECK-DAG: lgdr [[RO:%r[0-9]+]], %f0
+; CHECK-DAG: srlg [[RO]], [[RO]], 32
+; CHECK-DAG: lgdr [[RI:%r[0-9]+]], [[FSRC]]
+; CHECK-DAG: srlg [[RI]], [[RI]], 32
; CHECK: cs [[RI]], [[RO]], 0([[SRC]])
; CHECK: sllg [[RO]], [[RI]], 32
; CHECK: ldgr [[FSRC]], [[RO]]
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-01.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-01.ll
index c67b02e688de3..bb01b1d90eaa8 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-01.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-01.ll
@@ -12,10 +12,10 @@ define float @f1(ptr %src, float %b) {
; CHECK: ler %f0, [[FSRC]]
; CHECK: ler %f2, [[FB]]
; CHECK: brasl %r14, fminf@PLT
-; CHECK: lgdr [[RO:%r[0-9]+]], %f0
-; CHECK: srlg [[RO]], [[RO]], 32
-; CHECK: lgdr [[RI:%r[0-9]+]], [[FSRC]]
-; CHECK: srlg [[RI]], [[RI]], 32
+; CHECK-DAG: lgdr [[RO:%r[0-9]+]], %f0
+; CHECK-DAG: srlg [[RO]], [[RO]], 32
+; CHECK-DAG: lgdr [[RI:%r[0-9]+]], [[FSRC]]
+; CHECK-DAG: srlg [[RI]], [[RI]], 32
; CHECK: cs [[RI]], [[RO]], 0([[SRC]])
; CHECK: sllg [[RO]], [[RI]], 32
; CHECK: ldgr [[FSRC]], [[RO]]
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-ops-i128.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-ops-i128.ll
index c088f6d862e7c..beecfc1c36c96 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-ops-i128.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-ops-i128.ll
@@ -106,11 +106,11 @@ define i128 @atomicrmw_nand(ptr %src, i128 %b) {
; CHECK-NEXT: vl %v1, 0(%r3), 4
; CHECK-NEXT: .LBB4_1: # %atomicrmw.start
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vnn %v2, %v1, %v0
; CHECK-NEXT: vlgvg %r1, %v1, 1
; CHECK-NEXT: vlgvg %r0, %v1, 0
-; CHECK-NEXT: vnn %v1, %v1, %v0
-; CHECK-NEXT: vlgvg %r5, %v1, 1
-; CHECK-NEXT: vlgvg %r4, %v1, 0
+; CHECK-NEXT: vlgvg %r5, %v2, 1
+; CHECK-NEXT: vlgvg %r4, %v2, 0
; CHECK-NEXT: cdsg %r0, %r4, 0(%r3)
; CHECK-NEXT: vlvgp %v1, %r0, %r1
; CHECK-NEXT: jl .LBB4_1
diff --git a/llvm/test/CodeGen/SystemZ/bswap-09.ll b/llvm/test/CodeGen/SystemZ/bswap-09.ll
index a2d8273c89695..8fddcdbf8ed79 100644
--- a/llvm/test/CodeGen/SystemZ/bswap-09.ll
+++ b/llvm/test/CodeGen/SystemZ/bswap-09.ll
@@ -9,14 +9,14 @@ declare i128 @llvm.bswap.i128(i128 %a)
define i128 @f1(i128 %a, i128 %b, i128 %c) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v1, 0(%r4), 3
-; CHECK-NEXT: vl %v2, 0(%r3), 3
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vl %v1, 0(%r3), 3
; CHECK-NEXT: larl %r1, .LCPI0_0
-; CHECK-NEXT: vaq %v1, %v2, %v1
; CHECK-NEXT: vl %v2, 0(%r1), 3
-; CHECK-NEXT: vl %v0, 0(%r5), 3
-; CHECK-NEXT: vperm %v1, %v1, %v1, %v2
+; CHECK-NEXT: vl %v3, 0(%r5), 3
; CHECK-NEXT: vaq %v0, %v1, %v0
+; CHECK-NEXT: vperm %v0, %v0, %v0, %v2
+; CHECK-NEXT: vaq %v0, %v0, %v3
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%in = add i128 %a, %b
diff --git a/llvm/test/CodeGen/SystemZ/bswap-10.ll b/llvm/test/CodeGen/SystemZ/bswap-10.ll
index 6de2970b80e2e..465c666808958 100644
--- a/llvm/test/CodeGen/SystemZ/bswap-10.ll
+++ b/llvm/test/CodeGen/SystemZ/bswap-10.ll
@@ -9,14 +9,14 @@ declare i128 @llvm.bswap.i128(i128 %a)
define i128 @f1(i128 %a, i128 %b, i128 %c) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v1, 0(%r4), 3
-; CHECK-NEXT: vl %v2, 0(%r3), 3
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vl %v1, 0(%r3), 3
; CHECK-NEXT: larl %r1, .LCPI0_0
-; CHECK-NEXT: vaq %v1, %v2, %v1
; CHECK-NEXT: vl %v2, 0(%r1), 3
-; CHECK-NEXT: vl %v0, 0(%r5), 3
-; CHECK-NEXT: vperm %v1, %v1, %v1, %v2
+; CHECK-NEXT: vl %v3, 0(%r5), 3
; CHECK-NEXT: vaq %v0, %v1, %v0
+; CHECK-NEXT: vperm %v0, %v0, %v0, %v2
+; CHECK-NEXT: vaq %v0, %v0, %v3
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%in = add i128 %a, %b
diff --git a/llvm/test/CodeGen/SystemZ/call-zos-vec.ll b/llvm/test/CodeGen/SystemZ/call-zos-vec.ll
index 20bf2687c957e..32d29cb8ebc08 100644
--- a/llvm/test/CodeGen/SystemZ/call-zos-vec.ll
+++ b/llvm/test/CodeGen/SystemZ/call-zos-vec.ll
@@ -9,15 +9,15 @@ entry:
}
; CHECK-LABEL: sum_vecs1
-; CHECK: vaf 1,24,25
-; CHECK: vaf 1,1,26
-; CHECK: vaf 1,1,27
-; CHECK: vaf 1,1,28
-; CHECK: vaf 1,1,29
-; CHECK: vl 0,2304(4),4
-; CHECK: vaf 1,1,30
-; CHECK: vaf 1,1,31
-; CHECK: vaf 24,1,0
+; CHECK: vaf 0,24,25
+; CHECK: vaf 0,0,26
+; CHECK: vaf 0,0,27
+; CHECK: vaf 0,0,28
+; CHECK: vaf 0,0,29
+; CHECK: vl 1,2304(4),4
+; CHECK: vaf 0,0,30
+; CHECK: vaf 0,0,31
+; CHECK: vaf 24,0,1
define <4 x i32> @sum_vecs1(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4, <4 x i32> %v5, <4 x i32> %v6, <4 x i32> %v7, <4 x i32> %v8, <4 x i32> %v9) {
entry:
%add0 = add <4 x i32> %v1, %v2
diff --git a/llvm/test/CodeGen/SystemZ/dag-combine-05.ll b/llvm/test/CodeGen/SystemZ/dag-combine-05.ll
index 60528ad67d039..692d74250c1a0 100644
--- a/llvm/test/CodeGen/SystemZ/dag-combine-05.ll
+++ b/llvm/test/CodeGen/SystemZ/dag-combine-05.ll
@@ -9,18 +9,18 @@
define void @fun(i16 %arg0, ptr %src, ptr %dst) {
; CHECK-LABEL: fun:
; CHECK: # %bb.0: # %bb
-; CHECK-NEXT: llhr %r0, %r2
-; CHECK-NEXT: llh %r2, 0(%r3)
-; CHECK-NEXT: chi %r0, 9616
+; CHECK-NEXT: llh %r0, 0(%r3)
+; CHECK-NEXT: llhr %r1, %r2
+; CHECK-NEXT: chi %r1, 9616
; CHECK-NEXT: lhi %r1, 0
; CHECK-NEXT: lochil %r1, 1
-; CHECK-NEXT: afi %r2, 65535
-; CHECK-NEXT: llhr %r3, %r2
-; CHECK-NEXT: lhi %r0, 0
-; CHECK-NEXT: cr %r3, %r2
-; CHECK-NEXT: lochilh %r0, 1
-; CHECK-NEXT: ar %r0, %r1
-; CHECK-NEXT: st %r0, 0(%r4)
+; CHECK-NEXT: afi %r0, 65535
+; CHECK-NEXT: llhr %r2, %r0
+; CHECK-NEXT: lhi %r3, 0
+; CHECK-NEXT: cr %r2, %r0
+; CHECK-NEXT: lochilh %r3, 1
+; CHECK-NEXT: ar %r1, %r3
+; CHECK-NEXT: st %r1, 0(%r4)
; CHECK-NEXT: br %r14
bb:
%tmp = icmp ult i16 %arg0, 9616
diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs-zEC12.ll b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs-zEC12.ll
index 6228ffaa35fa2..cb6e9ca81bcfa 100644
--- a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs-zEC12.ll
+++ b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs-zEC12.ll
@@ -212,13 +212,13 @@ define <4 x i32> @vec128_and_f(<4 x i32> %cc_dep1) {
; CHECK-NEXT: aghi %r15, -176
; CHECK-NEXT: .cfi_def_cfa_offset 336
; CHECK-NEXT: # kill: def $r4l killed $r4l def $r4d
+; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d
; CHECK-NEXT: sllg %r0, %r4, 32
; CHECK-NEXT: lr %r0, %r5
-; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d
+; CHECK-NEXT: sllg %r1, %r2, 32
+; CHECK-NEXT: lr %r1, %r3
; CHECK-NEXT: stg %r0, 168(%r15)
-; CHECK-NEXT: sllg %r0, %r2, 32
-; CHECK-NEXT: lr %r0, %r3
-; CHECK-NEXT: stg %r0, 160(%r15)
+; CHECK-NEXT: stg %r1, 160(%r15)
; CHECK-NEXT: ld %f0, 160(%r15)
; CHECK-NEXT: ld %f2, 168(%r15)
; CHECK-NEXT: #APP
diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-zEC12.ll b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-zEC12.ll
index 19969ccf4e297..7f06f66e6411e 100644
--- a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-zEC12.ll
+++ b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-zEC12.ll
@@ -205,13 +205,13 @@ define <4 x i32> @vec128_and_f(<4 x i32> %cc_dep1) {
; CHECK-NEXT: aghi %r15, -176
; CHECK-NEXT: .cfi_def_cfa_offset 336
; CHECK-NEXT: # kill: def $r4l killed $r4l def $r4d
+; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d
; CHECK-NEXT: sllg %r0, %r4, 32
; CHECK-NEXT: lr %r0, %r5
-; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d
+; CHECK-NEXT: sllg %r1, %r2, 32
+; CHECK-NEXT: lr %r1, %r3
; CHECK-NEXT: stg %r0, 168(%r15)
-; CHECK-NEXT: sllg %r0, %r2, 32
-; CHECK-NEXT: lr %r0, %r3
-; CHECK-NEXT: stg %r0, 160(%r15)
+; CHECK-NEXT: stg %r1, 160(%r15)
; CHECK-NEXT: ld %f0, 160(%r15)
; CHECK-NEXT: ld %f2, 168(%r15)
; CHECK-NEXT: #APP
diff --git a/llvm/test/CodeGen/SystemZ/int-conv-14.ll b/llvm/test/CodeGen/SystemZ/int-conv-14.ll
index 98dc88f289620..72655ef9fbf27 100644
--- a/llvm/test/CodeGen/SystemZ/int-conv-14.ll
+++ b/llvm/test/CodeGen/SystemZ/int-conv-14.ll
@@ -343,13 +343,13 @@ define i128 @f25(i1 %a) {
define i128 @f26(ptr %ptr) {
; CHECK-LABEL: f26:
; CHECK: # %bb.0:
-; CHECK-NEXT: vgbm %v1, 0
-; CHECK-NEXT: vleb %v1, 0(%r3), 15
-; CHECK-NEXT: larl %r1, .LCPI25_0
-; CHECK-NEXT: vl %v2, 0(%r1), 3
; CHECK-NEXT: vgbm %v0, 0
-; CHECK-NEXT: vn %v1, %v1, %v2
-; CHECK-NEXT: vsq %v0, %v0, %v1
+; CHECK-NEXT: vleb %v0, 0(%r3), 15
+; CHECK-NEXT: larl %r1, .LCPI25_0
+; CHECK-NEXT: vl %v1, 0(%r1), 3
+; CHECK-NEXT: vgbm %v2, 0
+; CHECK-NEXT: vn %v0, %v0, %v1
+; CHECK-NEXT: vsq %v0, %v2, %v0
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%a = load i1, ptr %ptr
diff --git a/llvm/test/CodeGen/SystemZ/int-mul-12.ll b/llvm/test/CodeGen/SystemZ/int-mul-12.ll
index e7005f50a12fd..96132ad36f8b4 100644
--- a/llvm/test/CodeGen/SystemZ/int-mul-12.ll
+++ b/llvm/test/CodeGen/SystemZ/int-mul-12.ll
@@ -7,21 +7,20 @@
define i128 @f1(i128 %a, i128 %b) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: stmg %r12, %r15, 96(%r15)
-; CHECK-NEXT: .cfi_offset %r12, -64
-; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15)
+; CHECK-NEXT: .cfi_offset %r14, -48
; CHECK-NEXT: .cfi_offset %r15, -40
-; CHECK-NEXT: lg %r13, 8(%r3)
-; CHECK-NEXT: lg %r0, 8(%r4)
-; CHECK-NEXT: lgr %r1, %r13
-; CHECK-NEXT: mlgr %r12, %r0
-; CHECK-NEXT: msg %r1, 0(%r4)
-; CHECK-NEXT: msg %r0, 0(%r3)
-; CHECK-NEXT: agr %r1, %r12
-; CHECK-NEXT: agr %r0, %r1
-; CHECK-NEXT: stg %r13, 8(%r2)
-; CHECK-NEXT: stg %r0, 0(%r2)
-; CHECK-NEXT: lmg %r12, %r15, 96(%r15)
+; CHECK-NEXT: lg %r1, 8(%r3)
+; CHECK-NEXT: lg %r5, 8(%r4)
+; CHECK-NEXT: lgr %r14, %r1
+; CHECK-NEXT: mlgr %r0, %r5
+; CHECK-NEXT: msg %r14, 0(%r4)
+; CHECK-NEXT: msg %r5, 0(%r3)
+; CHECK-NEXT: agr %r14, %r0
+; CHECK-NEXT: agr %r5, %r14
+; CHECK-NEXT: stg %r1, 8(%r2)
+; CHECK-NEXT: stg %r5, 0(%r2)
+; CHECK-NEXT: lmg %r14, %r15, 112(%r15)
; CHECK-NEXT: br %r14
%res = mul i128 %a, %b
ret i128 %res
diff --git a/llvm/test/CodeGen/SystemZ/int-mul-13.ll b/llvm/test/CodeGen/SystemZ/int-mul-13.ll
index 82937cf66c629..96b4838dba524 100644
--- a/llvm/test/CodeGen/SystemZ/int-mul-13.ll
+++ b/llvm/test/CodeGen/SystemZ/int-mul-13.ll
@@ -24,12 +24,12 @@ define i64 @f1(i64 %dummy, i64 %a, i64 %b) {
define i64 @f2(i64 %dummy, i64 %a, i64 %b) {
; CHECK-LABEL: f2:
; CHECK: # %bb.0:
-; CHECK-NEXT: srag %r1, %r4, 63
+; CHECK-NEXT: srag %r0, %r4, 63
; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q
-; CHECK-NEXT: srag %r0, %r3, 63
-; CHECK-NEXT: ngr %r1, %r3
+; CHECK-NEXT: ngr %r0, %r3
+; CHECK-NEXT: srag %r1, %r3, 63
; CHECK-NEXT: mlgr %r2, %r4
-; CHECK-NEXT: ngr %r0, %r4
+; CHECK-NEXT: ngr %r1, %r4
; CHECK-NEXT: agr %r0, %r1
; CHECK-NEXT: sgr %r2, %r0
; CHECK-NEXT: br %r14
diff --git a/llvm/test/CodeGen/SystemZ/int-mul-15.ll b/llvm/test/CodeGen/SystemZ/int-mul-15.ll
index b7d41412d9c5f..01b7a21540491 100644
--- a/llvm/test/CodeGen/SystemZ/int-mul-15.ll
+++ b/llvm/test/CodeGen/SystemZ/int-mul-15.ll
@@ -43,10 +43,10 @@ define i64 @f3(i64 %dummy, i64 %a, i64 %b) {
; CHECK-NEXT: vgbm %v1, 0
; CHECK-NEXT: vlvgg %v1, %r3, 1
; CHECK-NEXT: vlvgg %v0, %r4, 1
+; CHECK-NEXT: vrepib %v2, 67
; CHECK-NEXT: vmlq %v0, %v1, %v0
-; CHECK-NEXT: vrepib %v1, 67
-; CHECK-NEXT: vsrlb %v0, %v0, %v1
-; CHECK-NEXT: vsrl %v0, %v0, %v1
+; CHECK-NEXT: vsrlb %v0, %v0, %v2
+; CHECK-NEXT: vsrl %v0, %v0, %v2
; CHECK-NEXT: vlgvg %r2, %v0, 1
; CHECK-NEXT: br %r14
%ax = zext i64 %a to i128
@@ -66,9 +66,9 @@ define i64 @f4(i64 %dummy, i64 %a, i64 %b) {
; CHECK-NEXT: vgbm %v1, 0
; CHECK-NEXT: vlvgg %v1, %r3, 1
; CHECK-NEXT: vlvgg %v0, %r4, 1
+; CHECK-NEXT: vrepib %v2, 64
; CHECK-NEXT: vmlq %v0, %v1, %v0
-; CHECK-NEXT: vrepib %v1, 64
-; CHECK-NEXT: vsrlb %v1, %v0, %v1
+; CHECK-NEXT: vsrlb %v1, %v0, %v2
; CHECK-NEXT: vo %v0, %v1, %v0
; CHECK-NEXT: vlgvg %r2, %v0, 1
; CHECK-NEXT: br %r14
diff --git a/llvm/test/CodeGen/SystemZ/int-uadd-14.ll b/llvm/test/CodeGen/SystemZ/int-uadd-14.ll
index c8873a4dfadef..2e8eee65edf59 100644
--- a/llvm/test/CodeGen/SystemZ/int-uadd-14.ll
+++ b/llvm/test/CodeGen/SystemZ/int-uadd-14.ll
@@ -6,17 +6,17 @@
define zeroext i1 @f1(i256 %a, i256 %b, ptr %res) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v2, 16(%r3), 3
-; CHECK-NEXT: vl %v3, 16(%r2), 3
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vl %v1, 0(%r2), 3
-; CHECK-NEXT: vaccq %v4, %v3, %v2
-; CHECK-NEXT: vacccq %v5, %v1, %v0, %v4
+; CHECK-NEXT: vl %v0, 16(%r3), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vl %v2, 0(%r3), 3
+; CHECK-NEXT: vl %v3, 0(%r2), 3
+; CHECK-NEXT: vaccq %v4, %v1, %v0
+; CHECK-NEXT: vaq %v0, %v1, %v0
+; CHECK-NEXT: vacccq %v5, %v3, %v2, %v4
; CHECK-NEXT: vlgvg %r2, %v5, 1
-; CHECK-NEXT: vacq %v0, %v1, %v0, %v4
-; CHECK-NEXT: vaq %v1, %v3, %v2
-; CHECK-NEXT: vst %v1, 16(%r4), 3
-; CHECK-NEXT: vst %v0, 0(%r4), 3
+; CHECK-NEXT: vacq %v2, %v3, %v2, %v4
+; CHECK-NEXT: vst %v0, 16(%r4), 3
+; CHECK-NEXT: vst %v2, 0(%r4), 3
; CHECK-NEXT: br %r14
%t = call {i256, i1} @llvm.uadd.with.overflow.i256(i256 %a, i256 %b)
%val = extractvalue {i256, i1} %t, 0
@@ -44,15 +44,15 @@ define zeroext i1 @f2(i256 %a, i256 %b) {
define i256 @f3(i256 %a, i256 %b) {
; CHECK-LABEL: f3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v2, 16(%r4), 3
-; CHECK-NEXT: vl %v3, 16(%r3), 3
-; CHECK-NEXT: vl %v0, 0(%r4), 3
-; CHECK-NEXT: vl %v1, 0(%r3), 3
-; CHECK-NEXT: vaccq %v4, %v3, %v2
-; CHECK-NEXT: vacq %v0, %v1, %v0, %v4
-; CHECK-NEXT: vaq %v1, %v3, %v2
-; CHECK-NEXT: vst %v1, 16(%r2), 3
-; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v0, 16(%r4), 3
+; CHECK-NEXT: vl %v1, 16(%r3), 3
+; CHECK-NEXT: vl %v2, 0(%r4), 3
+; CHECK-NEXT: vl %v3, 0(%r3), 3
+; CHECK-NEXT: vaccq %v4, %v1, %v0
+; CHECK-NEXT: vaq %v0, %v1, %v0
+; CHECK-NEXT: vacq %v2, %v3, %v2, %v4
+; CHECK-NEXT: vst %v0, 16(%r2), 3
+; CHECK-NEXT: vst %v2, 0(%r2), 3
; CHECK-NEXT: br %r14
%t = call {i256, i1} @llvm.uadd.with.overflow.i256(i256 %a, i256 %b)
%val = extractvalue {i256, i1} %t, 0
diff --git a/llvm/test/CodeGen/SystemZ/int-usub-13.ll b/llvm/test/CodeGen/SystemZ/int-usub-13.ll
index 794af3b73fbe2..ebdda5c69ecbd 100644
--- a/llvm/test/CodeGen/SystemZ/int-usub-13.ll
+++ b/llvm/test/CodeGen/SystemZ/int-usub-13.ll
@@ -6,18 +6,18 @@
define zeroext i1 @f1(i256 %a, i256 %b, ptr %res) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v2, 16(%r3), 3
-; CHECK-NEXT: vl %v3, 16(%r2), 3
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vl %v1, 0(%r2), 3
-; CHECK-NEXT: vscbiq %v4, %v3, %v2
-; CHECK-NEXT: vsbcbiq %v5, %v1, %v0, %v4
+; CHECK-NEXT: vl %v0, 16(%r3), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vl %v2, 0(%r3), 3
+; CHECK-NEXT: vl %v3, 0(%r2), 3
+; CHECK-NEXT: vscbiq %v4, %v1, %v0
+; CHECK-NEXT: vsq %v0, %v1, %v0
+; CHECK-NEXT: vsbcbiq %v5, %v3, %v2, %v4
; CHECK-NEXT: vlgvg %r2, %v5, 1
-; CHECK-NEXT: vsbiq %v0, %v1, %v0, %v4
-; CHECK-NEXT: vsq %v1, %v3, %v2
+; CHECK-NEXT: vsbiq %v2, %v3, %v2, %v4
; CHECK-NEXT: xilf %r2, 1
-; CHECK-NEXT: vst %v1, 16(%r4), 3
-; CHECK-NEXT: vst %v0, 0(%r4), 3
+; CHECK-NEXT: vst %v0, 16(%r4), 3
+; CHECK-NEXT: vst %v2, 0(%r4), 3
; CHECK-NEXT: br %r14
%t = call {i256, i1} @llvm.usub.with.overflow.i256(i256 %a, i256 %b)
%val = extractvalue {i256, i1} %t, 0
@@ -29,12 +29,12 @@ define zeroext i1 @f1(i256 %a, i256 %b, ptr %res) {
define zeroext i1 @f2(i256 %a, i256 %b) {
; CHECK-LABEL: f2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v2, 16(%r3), 3
-; CHECK-NEXT: vl %v3, 16(%r2), 3
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vl %v1, 0(%r2), 3
-; CHECK-NEXT: vscbiq %v2, %v3, %v2
-; CHECK-NEXT: vsbcbiq %v0, %v1, %v0, %v2
+; CHECK-NEXT: vl %v0, 16(%r3), 3
+; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vl %v2, 0(%r3), 3
+; CHECK-NEXT: vl %v3, 0(%r2), 3
+; CHECK-NEXT: vscbiq %v0, %v1, %v0
+; CHECK-NEXT: vsbcbiq %v0, %v3, %v2, %v0
; CHECK-NEXT: vlgvg %r2, %v0, 1
; CHECK-NEXT: xilf %r2, 1
; CHECK-NEXT: br %r14
@@ -46,15 +46,15 @@ define zeroext i1 @f2(i256 %a, i256 %b) {
define i256 @f3(i256 %a, i256 %b) {
; CHECK-LABEL: f3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v2, 16(%r4), 3
-; CHECK-NEXT: vl %v3, 16(%r3), 3
-; CHECK-NEXT: vl %v0, 0(%r4), 3
-; CHECK-NEXT: vl %v1, 0(%r3), 3
-; CHECK-NEXT: vscbiq %v4, %v3, %v2
-; CHECK-NEXT: vsbiq %v0, %v1, %v0, %v4
-; CHECK-NEXT: vsq %v1, %v3, %v2
-; CHECK-NEXT: vst %v1, 16(%r2), 3
-; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v0, 16(%r4), 3
+; CHECK-NEXT: vl %v1, 16(%r3), 3
+; CHECK-NEXT: vl %v2, 0(%r4), 3
+; CHECK-NEXT: vl %v3, 0(%r3), 3
+; CHECK-NEXT: vscbiq %v4, %v1, %v0
+; CHECK-NEXT: vsq %v0, %v1, %v0
+; CHECK-NEXT: vsbiq %v2, %v3, %v2, %v4
+; CHECK-NEXT: vst %v0, 16(%r2), 3
+; CHECK-NEXT: vst %v2, 0(%r2), 3
; CHECK-NEXT: br %r14
%t = call {i256, i1} @llvm.usub.with.overflow.i256(i256 %a, i256 %b)
%val = extractvalue {i256, i1} %t, 0
diff --git a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp.ll b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp.ll
index fdf1be68a5430..ec2e3b946659e 100644
--- a/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp.ll
+++ b/llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp.ll
@@ -11,11 +11,11 @@ define double @fun0_fadd(ptr %x) {
; CHECK-NEXT: adb %f0, 8(%r2)
; CHECK-NEXT: ld %f1, 24(%r2)
; CHECK-NEXT: adb %f1, 16(%r2)
+; CHECK-NEXT: ld %f2, 40(%r2)
+; CHECK-NEXT: adb %f2, 32(%r2)
+; CHECK-NEXT: adb %f2, 48(%r2)
; CHECK-NEXT: adbr %f0, %f1
-; CHECK-NEXT: ld %f1, 40(%r2)
-; CHECK-NEXT: adb %f1, 32(%r2)
-; CHECK-NEXT: adb %f1, 48(%r2)
-; CHECK-NEXT: adbr %f0, %f1
+; CHECK-NEXT: adbr %f0, %f2
; CHECK-NEXT: adb %f0, 56(%r2)
; CHECK-NEXT: br %r14
entry:
@@ -51,11 +51,11 @@ define float @fun1_fadd(ptr %x) {
; CHECK-NEXT: aeb %f0, 4(%r2)
; CHECK-NEXT: lde %f1, 12(%r2)
; CHECK-NEXT: aeb %f1, 8(%r2)
+; CHECK-NEXT: lde %f2, 20(%r2)
+; CHECK-NEXT: aeb %f2, 16(%r2)
+; CHECK-NEXT: aeb %f2, 24(%r2)
; CHECK-NEXT: aebr %f0, %f1
-; CHECK-NEXT: lde %f1, 20(%r2)
-; CHECK-NEXT: aeb %f1, 16(%r2)
-; CHECK-NEXT: aeb %f1, 24(%r2)
-; CHECK-NEXT: aebr %f0, %f1
+; CHECK-NEXT: aebr %f0, %f2
; CHECK-NEXT: aeb %f0, 28(%r2)
; CHECK-NEXT: br %r14
entry:
@@ -89,16 +89,16 @@ define fp128 @fun2_fadd(ptr %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vl %v1, 16(%r3), 3
+; CHECK-NEXT: vl %v2, 32(%r3), 3
+; CHECK-NEXT: vl %v3, 48(%r3), 3
+; CHECK-NEXT: vl %v4, 64(%r3), 3
+; CHECK-NEXT: vl %v5, 80(%r3), 3
; CHECK-NEXT: wfaxb %v0, %v1, %v0
-; CHECK-NEXT: vl %v1, 32(%r3), 3
-; CHECK-NEXT: vl %v2, 48(%r3), 3
-; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: wfaxb %v1, %v2, %v3
+; CHECK-NEXT: wfaxb %v2, %v4, %v5
+; CHECK-NEXT: vl %v3, 96(%r3), 3
; CHECK-NEXT: wfaxb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r3), 3
-; CHECK-NEXT: vl %v2, 80(%r3), 3
-; CHECK-NEXT: wfaxb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r3), 3
-; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: wfaxb %v1, %v2, %v3
; CHECK-NEXT: wfaxb %v0, %v0, %v1
; CHECK-NEXT: vl %v1, 112(%r3), 3
; CHECK-NEXT: wfaxb %v0, %v0, %v1
@@ -135,16 +135,16 @@ define <2 x double> @fun3_fadd(ptr %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl %v0, 0(%r2), 3
; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vl %v2, 32(%r2), 3
+; CHECK-NEXT: vl %v3, 48(%r2), 3
+; CHECK-NEXT: vl %v4, 64(%r2), 3
+; CHECK-NEXT: vl %v5, 80(%r2), 3
; CHECK-NEXT: vfadb %v0, %v1, %v0
-; CHECK-NEXT: vl %v1, 32(%r2), 3
-; CHECK-NEXT: vl %v2, 48(%r2), 3
-; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vfadb %v1, %v2, %v3
+; CHECK-NEXT: vfadb %v2, %v4, %v5
+; CHECK-NEXT: vl %v3, 96(%r2), 3
; CHECK-NEXT: vfadb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r2), 3
-; CHECK-NEXT: vl %v2, 80(%r2), 3
-; CHECK-NEXT: vfadb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r2), 3
-; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vfadb %v1, %v2, %v3
; CHECK-NEXT: vfadb %v0, %v0, %v1
; CHECK-NEXT: vl %v1, 112(%r2), 3
; CHECK-NEXT: vfadb %v24, %v0, %v1
@@ -180,16 +180,16 @@ define <4 x float> @fun4_fadd(ptr %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl %v0, 0(%r2), 3
; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vl %v2, 32(%r2), 3
+; CHECK-NEXT: vl %v3, 48(%r2), 3
+; CHECK-NEXT: vl %v4, 64(%r2), 3
+; CHECK-NEXT: vl %v5, 80(%r2), 3
; CHECK-NEXT: vfasb %v0, %v1, %v0
-; CHECK-NEXT: vl %v1, 32(%r2), 3
-; CHECK-NEXT: vl %v2, 48(%r2), 3
-; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vfasb %v1, %v2, %v3
+; CHECK-NEXT: vfasb %v2, %v4, %v5
+; CHECK-NEXT: vl %v3, 96(%r2), 3
; CHECK-NEXT: vfasb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r2), 3
-; CHECK-NEXT: vl %v2, 80(%r2), 3
-; CHECK-NEXT: vfasb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r2), 3
-; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vfasb %v1, %v2, %v3
; CHECK-NEXT: vfasb %v0, %v0, %v1
; CHECK-NEXT: vl %v1, 112(%r2), 3
; CHECK-NEXT: vfasb %v24, %v0, %v1
@@ -227,11 +227,11 @@ define double @fun5_fsub(ptr %x) {
; CHECK-NEXT: sdb %f0, 8(%r2)
; CHECK-NEXT: ld %f1, 24(%r2)
; CHECK-NEXT: adb %f1, 16(%r2)
+; CHECK-NEXT: ld %f2, 40(%r2)
+; CHECK-NEXT: adb %f2, 32(%r2)
+; CHECK-NEXT: adb %f2, 48(%r2)
; CHECK-NEXT: sdbr %f0, %f1
-; CHECK-NEXT: ld %f1, 40(%r2)
-; CHECK-NEXT: adb %f1, 32(%r2)
-; CHECK-NEXT: adb %f1, 48(%r2)
-; CHECK-NEXT: sdbr %f0, %f1
+; CHECK-NEXT: sdbr %f0, %f2
; CHECK-NEXT: sdb %f0, 56(%r2)
; CHECK-NEXT: br %r14
entry:
@@ -267,11 +267,11 @@ define float @fun6_fsub(ptr %x) {
; CHECK-NEXT: seb %f0, 4(%r2)
; CHECK-NEXT: lde %f1, 12(%r2)
; CHECK-NEXT: aeb %f1, 8(%r2)
+; CHECK-NEXT: lde %f2, 20(%r2)
+; CHECK-NEXT: aeb %f2, 16(%r2)
+; CHECK-NEXT: aeb %f2, 24(%r2)
; CHECK-NEXT: sebr %f0, %f1
-; CHECK-NEXT: lde %f1, 20(%r2)
-; CHECK-NEXT: aeb %f1, 16(%r2)
-; CHECK-NEXT: aeb %f1, 24(%r2)
-; CHECK-NEXT: sebr %f0, %f1
+; CHECK-NEXT: sebr %f0, %f2
; CHECK-NEXT: seb %f0, 28(%r2)
; CHECK-NEXT: br %r14
entry:
@@ -305,16 +305,16 @@ define fp128 @fun7_fsub(ptr %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vl %v1, 16(%r3), 3
+; CHECK-NEXT: vl %v2, 32(%r3), 3
+; CHECK-NEXT: vl %v3, 48(%r3), 3
+; CHECK-NEXT: vl %v4, 64(%r3), 3
+; CHECK-NEXT: vl %v5, 80(%r3), 3
; CHECK-NEXT: wfsxb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 32(%r3), 3
-; CHECK-NEXT: vl %v2, 48(%r3), 3
-; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: wfaxb %v1, %v2, %v3
+; CHECK-NEXT: wfaxb %v2, %v4, %v5
+; CHECK-NEXT: vl %v3, 96(%r3), 3
; CHECK-NEXT: wfsxb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r3), 3
-; CHECK-NEXT: vl %v2, 80(%r3), 3
-; CHECK-NEXT: wfaxb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r3), 3
-; CHECK-NEXT: wfaxb %v1, %v1, %v2
+; CHECK-NEXT: wfaxb %v1, %v2, %v3
; CHECK-NEXT: wfsxb %v0, %v0, %v1
; CHECK-NEXT: vl %v1, 112(%r3), 3
; CHECK-NEXT: wfsxb %v0, %v0, %v1
@@ -351,16 +351,16 @@ define <2 x double> @fun8_fsub(ptr %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl %v0, 0(%r2), 3
; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vl %v2, 32(%r2), 3
+; CHECK-NEXT: vl %v3, 48(%r2), 3
+; CHECK-NEXT: vl %v4, 64(%r2), 3
+; CHECK-NEXT: vl %v5, 80(%r2), 3
; CHECK-NEXT: vfsdb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 32(%r2), 3
-; CHECK-NEXT: vl %v2, 48(%r2), 3
-; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vfadb %v1, %v2, %v3
+; CHECK-NEXT: vfadb %v2, %v4, %v5
+; CHECK-NEXT: vl %v3, 96(%r2), 3
; CHECK-NEXT: vfsdb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r2), 3
-; CHECK-NEXT: vl %v2, 80(%r2), 3
-; CHECK-NEXT: vfadb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r2), 3
-; CHECK-NEXT: vfadb %v1, %v1, %v2
+; CHECK-NEXT: vfadb %v1, %v2, %v3
; CHECK-NEXT: vfsdb %v0, %v0, %v1
; CHECK-NEXT: vl %v1, 112(%r2), 3
; CHECK-NEXT: vfsdb %v24, %v0, %v1
@@ -396,16 +396,16 @@ define <4 x float> @fun9_fsub(ptr %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl %v0, 0(%r2), 3
; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vl %v2, 32(%r2), 3
+; CHECK-NEXT: vl %v3, 48(%r2), 3
+; CHECK-NEXT: vl %v4, 64(%r2), 3
+; CHECK-NEXT: vl %v5, 80(%r2), 3
; CHECK-NEXT: vfssb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 32(%r2), 3
-; CHECK-NEXT: vl %v2, 48(%r2), 3
-; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vfasb %v1, %v2, %v3
+; CHECK-NEXT: vfasb %v2, %v4, %v5
+; CHECK-NEXT: vl %v3, 96(%r2), 3
; CHECK-NEXT: vfssb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r2), 3
-; CHECK-NEXT: vl %v2, 80(%r2), 3
-; CHECK-NEXT: vfasb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r2), 3
-; CHECK-NEXT: vfasb %v1, %v1, %v2
+; CHECK-NEXT: vfasb %v1, %v2, %v3
; CHECK-NEXT: vfssb %v0, %v0, %v1
; CHECK-NEXT: vl %v1, 112(%r2), 3
; CHECK-NEXT: vfssb %v24, %v0, %v1
@@ -443,11 +443,11 @@ define double @fun10_fmul(ptr %x) {
; CHECK-NEXT: mdb %f0, 0(%r2)
; CHECK-NEXT: ld %f1, 24(%r2)
; CHECK-NEXT: mdb %f1, 16(%r2)
+; CHECK-NEXT: ld %f2, 40(%r2)
+; CHECK-NEXT: mdb %f2, 32(%r2)
+; CHECK-NEXT: mdb %f2, 48(%r2)
; CHECK-NEXT: mdbr %f0, %f1
-; CHECK-NEXT: ld %f1, 40(%r2)
-; CHECK-NEXT: mdb %f1, 32(%r2)
-; CHECK-NEXT: mdb %f1, 48(%r2)
-; CHECK-NEXT: mdbr %f0, %f1
+; CHECK-NEXT: mdbr %f0, %f2
; CHECK-NEXT: mdb %f0, 56(%r2)
; CHECK-NEXT: br %r14
entry:
@@ -483,11 +483,11 @@ define float @fun11_fmul(ptr %x) {
; CHECK-NEXT: meeb %f0, 0(%r2)
; CHECK-NEXT: lde %f1, 12(%r2)
; CHECK-NEXT: meeb %f1, 8(%r2)
+; CHECK-NEXT: lde %f2, 20(%r2)
+; CHECK-NEXT: meeb %f2, 16(%r2)
+; CHECK-NEXT: meeb %f2, 24(%r2)
; CHECK-NEXT: meebr %f0, %f1
-; CHECK-NEXT: lde %f1, 20(%r2)
-; CHECK-NEXT: meeb %f1, 16(%r2)
-; CHECK-NEXT: meeb %f1, 24(%r2)
-; CHECK-NEXT: meebr %f0, %f1
+; CHECK-NEXT: meebr %f0, %f2
; CHECK-NEXT: meeb %f0, 28(%r2)
; CHECK-NEXT: br %r14
entry:
@@ -521,16 +521,16 @@ define fp128 @fun12_fmul(ptr %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vl %v1, 16(%r3), 3
+; CHECK-NEXT: vl %v2, 32(%r3), 3
+; CHECK-NEXT: vl %v3, 48(%r3), 3
+; CHECK-NEXT: vl %v4, 64(%r3), 3
+; CHECK-NEXT: vl %v5, 80(%r3), 3
; CHECK-NEXT: wfmxb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 32(%r3), 3
-; CHECK-NEXT: vl %v2, 48(%r3), 3
-; CHECK-NEXT: wfmxb %v1, %v1, %v2
+; CHECK-NEXT: wfmxb %v1, %v2, %v3
+; CHECK-NEXT: wfmxb %v2, %v4, %v5
+; CHECK-NEXT: vl %v3, 96(%r3), 3
; CHECK-NEXT: wfmxb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r3), 3
-; CHECK-NEXT: vl %v2, 80(%r3), 3
-; CHECK-NEXT: wfmxb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r3), 3
-; CHECK-NEXT: wfmxb %v1, %v1, %v2
+; CHECK-NEXT: wfmxb %v1, %v2, %v3
; CHECK-NEXT: wfmxb %v0, %v0, %v1
; CHECK-NEXT: vl %v1, 112(%r3), 3
; CHECK-NEXT: wfmxb %v0, %v0, %v1
@@ -567,16 +567,16 @@ define <2 x double> @fun13_fmul(ptr %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl %v0, 0(%r2), 3
; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vl %v2, 32(%r2), 3
+; CHECK-NEXT: vl %v3, 48(%r2), 3
+; CHECK-NEXT: vl %v4, 64(%r2), 3
+; CHECK-NEXT: vl %v5, 80(%r2), 3
; CHECK-NEXT: vfmdb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 32(%r2), 3
-; CHECK-NEXT: vl %v2, 48(%r2), 3
-; CHECK-NEXT: vfmdb %v1, %v1, %v2
+; CHECK-NEXT: vfmdb %v1, %v2, %v3
+; CHECK-NEXT: vfmdb %v2, %v4, %v5
+; CHECK-NEXT: vl %v3, 96(%r2), 3
; CHECK-NEXT: vfmdb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r2), 3
-; CHECK-NEXT: vl %v2, 80(%r2), 3
-; CHECK-NEXT: vfmdb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r2), 3
-; CHECK-NEXT: vfmdb %v1, %v1, %v2
+; CHECK-NEXT: vfmdb %v1, %v2, %v3
; CHECK-NEXT: vfmdb %v0, %v0, %v1
; CHECK-NEXT: vl %v1, 112(%r2), 3
; CHECK-NEXT: vfmdb %v24, %v0, %v1
@@ -612,16 +612,16 @@ define <4 x float> @fun14_fmul(ptr %x) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vl %v0, 0(%r2), 3
; CHECK-NEXT: vl %v1, 16(%r2), 3
+; CHECK-NEXT: vl %v2, 32(%r2), 3
+; CHECK-NEXT: vl %v3, 48(%r2), 3
+; CHECK-NEXT: vl %v4, 64(%r2), 3
+; CHECK-NEXT: vl %v5, 80(%r2), 3
; CHECK-NEXT: vfmsb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 32(%r2), 3
-; CHECK-NEXT: vl %v2, 48(%r2), 3
-; CHECK-NEXT: vfmsb %v1, %v1, %v2
+; CHECK-NEXT: vfmsb %v1, %v2, %v3
+; CHECK-NEXT: vfmsb %v2, %v4, %v5
+; CHECK-NEXT: vl %v3, 96(%r2), 3
; CHECK-NEXT: vfmsb %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 64(%r2), 3
-; CHECK-NEXT: vl %v2, 80(%r2), 3
-; CHECK-NEXT: vfmsb %v1, %v1, %v2
-; CHECK-NEXT: vl %v2, 96(%r2), 3
-; CHECK-NEXT: vfmsb %v1, %v1, %v2
+; CHECK-NEXT: vfmsb %v1, %v2, %v3
; CHECK-NEXT: vfmsb %v0, %v0, %v1
; CHECK-NEXT: vl %v1, 112(%r2), 3
; CHECK-NEXT: vfmsb %v24, %v0, %v1
diff --git a/llvm/test/CodeGen/SystemZ/memcpy-01.ll b/llvm/test/CodeGen/SystemZ/memcpy-01.ll
index cabbfb40acb9a..e62f3eea58efc 100644
--- a/llvm/test/CodeGen/SystemZ/memcpy-01.ll
+++ b/llvm/test/CodeGen/SystemZ/memcpy-01.ll
@@ -149,8 +149,8 @@ define void @f13() {
; CHECK-LABEL: f13:
; CHECK: brasl %r14, foo at PLT
; CHECK: mvc 200(256,%r15), 3826(%r15)
-; CHECK: mvc 456(256,%r15), 4082(%r15)
-; CHECK: lay [[NEWSRC:%r[1-5]]], 4338(%r15)
+; CHECK-DAG: mvc 456(256,%r15), 4082(%r15)
+; CHECK-DAG: lay [[NEWSRC:%r[1-5]]], 4338(%r15)
; CHECK: mvc 712(256,%r15), 0([[NEWSRC]])
; CHECK: mvc 968(256,%r15), 256([[NEWSRC]])
; CHECK: mvc 1224(255,%r15), 512([[NEWSRC]])
diff --git a/llvm/test/CodeGen/SystemZ/memset-01.ll b/llvm/test/CodeGen/SystemZ/memset-01.ll
index 535ccfd7b9e28..06a34e0297868 100644
--- a/llvm/test/CodeGen/SystemZ/memset-01.ll
+++ b/llvm/test/CodeGen/SystemZ/memset-01.ll
@@ -192,8 +192,8 @@ define void @f17(ptr %dest, i8 %val) {
; CHECK-NEXT: mvc 3584(255,%r2), 3583(%r2)
; CHECK-NEXT: stc %r3, 3839(%r2)
; CHECK-NEXT: mvc 3840(255,%r2), 3839(%r2)
-; CHECK-NEXT: lay %r1, 4096(%r2)
-; CHECK-NEXT: stc %r3, 4095(%r2)
+; CHECK-DAG: lay %r1, 4096(%r2)
+; CHECK-DAG: stc %r3, 4095(%r2)
; CHECK-NEXT: mvc 0(1,%r1), 4095(%r2)
; CHECK-NEXT: br %r14
%addr = getelementptr i8, ptr %dest, i64 3583
diff --git a/llvm/test/CodeGen/SystemZ/misched-prera-biaspregs.mir b/llvm/test/CodeGen/SystemZ/misched-prera-biaspregs.mir
new file mode 100644
index 0000000000000..a073833d1e3f2
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/misched-prera-biaspregs.mir
@@ -0,0 +1,87 @@
+# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+# RUN: -run-pass=machine-scheduler 2>&1 | FileCheck %s
+
+# The COPY to r2 should be right before the return.
+# CHECK: name: fun0
+# CHECK: $r2d = COPY %0
+# CHECK-NEXT: Return implicit $r2d
+---
+name: fun0
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ %0:gr64bit = LGHI 0
+ $r2d = COPY %0
+ %1:gr64bit = LGHI 0
+ Return implicit $r2d
+...
+
+# The COPY from r3 should be first.
+# CHECK: name: fun1
+# CHECK: liveins: $r3d
+# CHECK-NEXT: {{ $}}
+# CHECK-NEXT: %2:gr64bit = COPY $r3d
+---
+name: fun1
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $r3d
+
+ %0:gr64bit = LGHI 1
+ %1:gr64bit = COPY %0
+ %2:gr64bit = COPY %1
+ %2:gr64bit = COPY $r3d
+ $r2d = COPY %2
+ Return implicit $r2d
+...
+
+# The LGHI to r2 should be right before the return.
+# CHECK: name: fun2
+# CHECK: $r2d = LGHI 0
+# CHECK-NEXT: Return implicit $r2d
+---
+name: fun2
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ %0:gr64bit = LGHI 0
+ $r2d = LGHI 0
+ %1:gr64bit = LGHI 0
+ Return implicit $r2d
+...
+
+# The LA to r2 should be right before the return.
+# CHECK: name: fun3
+# CHECK: $r2d = LA %stack.0, 0, $noreg
+# CHECK-NEXT: Return implicit $r2d
+---
+name: fun3
+tracksRegLiveness: true
+stack:
+ - { id: 0, size: 8 }
+body: |
+ bb.0:
+ $r2d = LA %stack.0, 0, $noreg
+ %0:gr64bit = LGHI 0
+ Return implicit killed $r2d
+...
+
+# Don't reorder phys-reg COPYs.
+# CHECK: name: fun4
+# CHECK: %0:gr64bit = COPY $r2d
+# CHECK-NEXT: %1:gr64bit = COPY $r3d
+---
+name: fun4
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $r2d, $r3d
+
+ %0:gr64bit = COPY $r2d
+ %1:gr64bit = COPY $r3d
+ %2:gr64bit = AGRK %0, %0, implicit-def $cc
+ %3:gr64bit = AGRK %2, %0, implicit-def $cc
+
+ Return
+...
diff --git a/llvm/test/CodeGen/SystemZ/misched-prera-copy-coal.mir b/llvm/test/CodeGen/SystemZ/misched-prera-copy-coal.mir
new file mode 100644
index 0000000000000..05a1fe699c584
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/misched-prera-copy-coal.mir
@@ -0,0 +1,31 @@
+# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+# RUN: -run-pass=machine-scheduler 2>&1 | FileCheck %s
+
+# Respect the weak edge between the SLLK and CLFIMux. Only if the SLLK is scheduled
+# below can the COPY be coalesced.
+# CHECK: name: fun0
+# CHECK: CLFIMux %0, 0, implicit-def $cc
+# CHECK-NEXT: %1:gr32bit = SLLK %0, $noreg, 1
+# CHECK-NEXT: %0:gr32bit = COPY %1
+# CHECK-NEXT: BRC 14, 10, %bb.1, implicit killed $cc
+---
+name: fun0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ successors: %bb.1(0x80000000)
+
+ %0:gr32bit = LHIMux 0
+
+ bb.1:
+ successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+ %1:gr32bit = SLLK %0, $noreg, 1
+ CLFIMux %0, 0, implicit-def $cc
+ %0:gr32bit = COPY %1
+ BRC 14, 10, %bb.1, implicit killed $cc
+ J %bb.2
+
+ bb.2:
+ Return
+...
diff --git a/llvm/test/CodeGen/SystemZ/misched-prera-latencies.mir b/llvm/test/CodeGen/SystemZ/misched-prera-latencies.mir
new file mode 100644
index 0000000000000..4b45d336d7e01
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/misched-prera-latencies.mir
@@ -0,0 +1,167 @@
+# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+# RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler 2>&1\
+# RUN: | FileCheck %s
+
+# The CDFBR has a longer latency than the VL64, so schedule it above.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun0:%bb.0
+# CHECK: Region is not tiny.
+# CHECK: Latency scheduling enabled.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): %0:gr32bit = COPY $r2l
+# CHECK-NEXT: SU(2): %2:fp64bit = nofpexcept CDFBR %0:gr32bit, implicit $fpc
+# CHECK-NEXT: SU(1): %1:vr64bit = VL64 $noreg, 0, $noreg
+# CHECK-NEXT: SU(3): undef %3.subreg_h64:vr128bit = nofpexcept WFDDB %1:vr64bit,
+---
+name: fun0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r2l
+
+ %0:gr32bit = COPY $r2l
+ %1:vr64bit = VL64 $noreg, 0, $noreg :: (load (s64) from `ptr null`)
+ %2:fp64bit = nofpexcept CDFBR %0, implicit $fpc
+ undef %3.subreg_h64:vr128bit = nofpexcept WFDDB %1, %2, implicit $fpc
+ %4:vr128bit = VGBM 0
+ %5:vr128bit = VMRHG %3, %4
+ VST %5, $noreg, 0, $noreg :: (store (s128) into `ptr null`, align 8)
+ Return
+...
+
+# The TMLMux will be scheduled first even though the LA is available and of lesser
+# height, because the TMLMux Depth equals the remaining latency (on CP).
+# Then, the AGHIK will follow even though it also has a Height of 1, because
+# it does not increase the scheduled latency as the TMLMux also has a Height of 1.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun1:%bb.0
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun1:%bb.1
+# CHECK: Region is not tiny.
+# CHECK: Latency scheduling enabled.
+# CHECK: SU(2): %3:addr64bit = LA %3:addr64bit, 1, $noreg
+# CHECK: Depth : 0
+# CHECK: Height : 0
+# CHECK: SU(9): dead %13:gr64bit = AGHIK %12:gr64bit, -1,
+# CHECK: Depth : 6
+# CHECK: Height : 1
+# CHECK: SU(10): TMLMux %0:gr32bit, 1, implicit-def $cc
+# CHECK: Depth : 6
+# CHECK: Height : 1
+# CHECK: Queue BotQ.A: 2 10
+# CHECK: Scheduling SU(10)
+# CHECK: Queue BotQ.A: 2 9 1
+# CHECK: Scheduling SU(9)
+# CHECK: *** Final schedule for %bb.1 ***
+# CHECK: SU(2): %3:addr64bit = LA %3:addr64bit, 1, $noreg
+# CHECK-NEXT: SU(9): dead %13:gr64bit = AGHIK %12:gr64bit, -1,
+# CHECK-NEXT: SU(10): TMLMux %0:gr32bit, 1, implicit-def $cc
+---
+name: fun1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $r2l, $r3l
+
+ %0:gr32bit = COPY $r3l
+ %1:gr32bit = COPY $r2l
+ %2:gr32bit = COPY %0
+ %3:addr64bit = LGHI 0
+ %4:gr64bit = COPY %3
+ %5:gr32bit = COPY %2
+ J %bb.1
+
+ bb.1:
+ successors: %bb.1(0x80000000)
+
+ %6:gr32bit = MSRKC %1, %5, implicit-def dead $cc
+ %2:gr32bit = ORK %6, %2, implicit-def dead $cc
+ %3:addr64bit = LA %3, 1, $noreg
+ %7:gr64bit = COPY %4 ; Make region non-tiny.
+ %8:gr64bit = COPY %7 ;
+ %9:gr64bit = COPY %8 ;
+ %10:gr64bit = COPY %9 ;
+ %11:gr64bit = COPY %10 ;
+ %12:gr64bit = COPY %11 ;
+ %13:gr64bit = AGHIK %12, -1, implicit-def dead $cc
+ TMLMux %0, 1, implicit-def $cc
+ BRC 15, 7, %bb.1, implicit killed $cc
+ Return
+...
+
+# This region has many nodes compared to the maximum height: Don't
+# care about heights/latencies. The AGRKs will remain interleaved with
+# the COPYs instead of all ending up at the bottom.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun2:%bb.0
+# CHECK: Region is not tiny.
+# CHECK-NEXT: Latency scheduling disabled.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): %0:gr64bit = COPY undef %1:gr64bit
+# CHECK-NEXT: SU(1): dead %2:gr64bit = AGRK %0:gr64bit, %0:gr64bit,
+# CHECK-NEXT: SU(2): %3:gr64bit = COPY undef %1:gr64bit
+# CHECK-NEXT: SU(3): dead %4:gr64bit = AGRK %3:gr64bit, %3:gr64bit,
+# CHECK-NEXT: SU(4): %5:gr64bit = COPY undef %1:gr64bit
+# CHECK-NEXT: SU(5): dead %6:gr64bit = AGRK %5:gr64bit, %5:gr64bit,
+# CHECK-NEXT: SU(6): %7:gr64bit = COPY undef %1:gr64bit
+# CHECK-NEXT: SU(7): dead %8:gr64bit = AGRK %7:gr64bit, %7:gr64bit,
+# CHECK-NEXT: SU(8): %9:gr64bit = COPY undef %1:gr64bit
+# CHECK-NEXT: SU(9): dead %10:gr64bit = AGRK %9:gr64bit, %9:gr64bit,
+# CHECK-NEXT: SU(10): %11:gr64bit = COPY undef %1:gr64bit
+# CHECK-NEXT: SU(11): dead %12:gr64bit = AGRK %11:gr64bit, %11:gr64bit,
+---
+name: fun2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ %1:gr64bit = COPY undef %0:gr64bit
+ %2:gr64bit = AGRK %1, %1, implicit-def dead $cc
+ %3:gr64bit = COPY undef %0:gr64bit
+ %4:gr64bit = AGRK %3, %3, implicit-def dead $cc
+ %5:gr64bit = COPY undef %0:gr64bit
+ %6:gr64bit = AGRK %5, %5, implicit-def dead $cc
+ %7:gr64bit = COPY undef %0:gr64bit
+ %8:gr64bit = AGRK %7, %7, implicit-def dead $cc
+ %9:gr64bit = COPY undef %0:gr64bit
+ %10:gr64bit = AGRK %9, %9, implicit-def dead $cc
+ %11:gr64bit = COPY undef %0:gr64bit
+ %12:gr64bit = AGRK %11, %11, implicit-def dead $cc
+ Return
+...
+
+# The first WFDDB is scheduled first as it is on the Critical Path below the
+# other WFDDB. The CDFBR is then scheduled below the VL64 per the original order
+# even though it has a greater height as neither of these SUs would extend the
+# scheduled latency.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun3:%bb.0
+# CHECK: Region is not tiny.
+# CHECK-NEXT: Latency scheduling enabled.
+# CHECK: SU(1): dead %1:vr64bit = VL64 $noreg, 0, $noreg ::
+# CHECK: Height : 3
+# CHECK: SU(2): dead %2:fp64bit = nofpexcept CDFBR %0:gr32bit,
+# CHECK: Height : 6
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK: SU(5): undef %6.subreg_h64:vr128bit = WFDDB %3:vr64bit, %4:vr64bit,
+# CHECK-NEXT: SU(1): dead %1:vr64bit = VL64 $noreg, 0, $noreg ::
+# CHECK-NEXT: SU(2): dead %2:fp64bit = nofpexcept CDFBR %0:gr32bit,
+# CHECK: SU(8): dead undef %10.subreg_h64:vr128bit = WFDDB %7:vr64bit,
+---
+name: fun3
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r2l
+
+ %0:gr32bit = COPY $r2l
+ %1:vr64bit = VL64 $noreg, 0, $noreg :: (load (s64) from `ptr null`)
+ %2:fp64bit = nofpexcept CDFBR %0, implicit $fpc
+ %3:vr64bit = COPY undef %3:vr64bit
+ %5:vr64bit = COPY undef %4:vr64bit
+ undef %6.subreg_h64:vr128bit = WFDDB %3, %5, implicit $fpc
+ %7:vr64bit = COPY %6.subreg_h64:vr128bit
+ %9:vr64bit = COPY undef %8:vr64bit
+ undef %10.subreg_h64:vr128bit = WFDDB %7, %9, implicit $fpc
+ Return
+...
diff --git a/llvm/test/CodeGen/SystemZ/misched-prera-loads.mir b/llvm/test/CodeGen/SystemZ/misched-prera-loads.mir
new file mode 100644
index 0000000000000..8c80320bc7ded
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/misched-prera-loads.mir
@@ -0,0 +1,391 @@
+# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+# RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler 2>&1\
+# RUN: | FileCheck %s
+
+--- |
+
+ define void @fun0() { ret void }
+ define void @fun1() { ret void }
+ define void @fun2() { ret void }
+ define void @fun3() { ret void }
+ define void @fun4(ptr %Arg) { ret void }
+ define void @fun5(ptr %Arg) { ret void }
+ define void @fun6() { ret void }
+ define void @fun7() { ret void }
+ define void @fun8() { ret void }
+...
+
+# Schedule the LG low. The heuristic to move a load down to its user to
+# shorten the live range (of %1) makes sure to not increase the scheduled latency, so
+# therefore it ends up above the AG.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun0:%bb.0
+# CHECK: Region is not tiny.
+# CHECK-NEXT: Latency scheduling enabled.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK: SU(1): %1:addr64bit = LG $noreg, 0, $noreg
+# CHECK-NEXT: SU(7): dead %7:gr64bit = AG %6:gr64bit(tied-def 0),
+# CHECK-NEXT: SU(8): dead %8:addr64bit = LGHI 0
+# CHECK-NEXT: SU(9): STG %1:addr64bit, $noreg, 0, $noreg
+# CHECK-NEXT: SU(10): $r2d = LGHI 0
+---
+name: fun0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r2d
+
+ %0:addr64bit = COPY $r2d
+ %1:addr64bit = LG $noreg, 0, $noreg
+ %2:gr64bit = COPY %0
+ %3:gr64bit = LGHI 1
+ %4:gr64bit = COPY %3 ; Make region non-tiny.
+ %5:gr64bit = COPY %4 ;
+ %6:gr64bit = AGRK %2, %5, implicit-def $cc
+ dead %7:gr64bit = AG %6, $noreg, 0, $noreg, implicit-def dead $cc
+ dead %8:addr64bit = LGHI 0
+ STG %1, $noreg, 0, $noreg
+ $r2d = LGHI 0
+ Return
+...
+
+# Schedule the '%3 = WFMADB' as soon as its only user and its use operands are
+# already live:
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun1:%bb.0
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun1:%bb.0
+# CHECK: Region is not tiny.
+# CHECK-NEXT: Latency scheduling enabled.
+#
+# CHECK: Pick Bot LIVE-REDUC
+#
+# CHECK: Queue BotQ.A: 5 7 3
+# CHECK: Scheduling SU(7)
+# CHECK: Live regs was: %2, %3, %5, %7
+# CHECK: Queue BotQ.A: 5 3
+# CHECK: Pick Bot LIVE-REDUC
+# CHECK-NEXT: Scheduling SU(3) %3:vr64bit = nofpexcept WFMADB %2:fp64bit,
+# CHECK: Live regs was: %1, %2, %3, %5
+#
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK: SU(3): %3:vr64bit = nofpexcept WFMADB %2:fp64bit, %2:fp64bit, %1:fp64bit,
+# CHECK-NEXT: SU(7): %7:vr64bit = nofpexcept WFMADB %1:fp64bit, %2:fp64bit, %2:fp64bit,
+# CHECK-NEXT: SU(4): %4:vr64bit = nofpexcept WFMDB %3:vr64bit, %2:fp64bit,
+---
+name: fun1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $f0d, $f2d
+
+ %0:fp64bit = COPY $f2d
+ %1:fp64bit = COPY $f0d
+ %2:fp64bit = LZDR
+ %3:vr64bit = nofpexcept WFMADB %2, %2, %1, implicit $fpc
+ %4:vr64bit = nofpexcept WFMDB %3, %2, implicit $fpc
+ %5:vr64bit = nofpexcept WFMADB %2, %2, %0, implicit $fpc
+ %6:vr64bit = nofpexcept WFMDB %5, %2, implicit $fpc
+ %7:vr64bit = nofpexcept WFMADB %1, %2, %2, implicit $fpc
+ %8:vr64bit = nofpexcept WFADB %7, %2, implicit $fpc
+ %9:addr64bit = LGHI 0
+ $f0d = COPY %2
+ $f2d = COPY %2
+ CallBASR %9, $f0d, $f2d, csr_systemz_elf, implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc, implicit-def dead $f0d
+ %10:vr64bit = nofpexcept WFMADB %8, %2, %4, implicit $fpc
+ %11:vr64bit = nofpexcept WFMADB %6, %2, %10, implicit $fpc
+ VST64 %11, $noreg, 0, $noreg :: (store (s64) into `ptr null`)
+ Return
+...
+
+# *Don't* schedule the AGR low since %1 continues to be live above it.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun2:%bb.0
+# CHECK: Region is not tiny.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): %0:gr64bit = COPY $r2d
+# CHECK-NEXT: SU(1): %1:gr64bit = COPY $r3d
+# CHECK-NEXT: SU(2): %1:gr64bit = AGR %1:gr64bit
+---
+name: fun2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r2d, $r3d
+
+ %0:gr64bit = COPY $r2d
+ %1:gr64bit = COPY $r3d
+ %1:gr64bit = AGR %1, %0, implicit-def $cc
+ %2:gr64bit = COPY undef %8:gr64bit ; Make region non-tiny
+ %3:gr64bit = COPY undef %8:gr64bit ;
+ %4:gr64bit = COPY undef %8:gr64bit ;
+ %5:gr64bit = COPY undef %8:gr64bit ;
+ %6:gr64bit = COPY undef %8:gr64bit ;
+ %7:gr64bit = COPY undef %8:gr64bit ;
+ STG %7, $noreg, 0, $noreg
+ STG %1, $noreg, 0, $noreg
+ STG %0, $noreg, 0, $noreg
+ Return
+...
+
+# *Don't* schedule the LGFR right above its user: %1 is not live.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun3:%bb.0
+# CHECK: Region is not tiny.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): %0:gr32bit = COPY $r5l
+# CHECK-NEXT: SU(1): %1:gr32bit = COPY $r3l
+# CHECK-NEXT: SU(2): %2:gr64bit = LGFR %1:gr32bit
+# CHECK-NEXT: SU(3): %3:gr64bit = LGFR %0:gr32bit
+# CHECK-NEXT: SU(4): %3:gr64bit = MSGF %3:gr64bit(tied-def 0), $noreg, 4, $noreg ::
+# CHECK-NEXT: SU(5): %2:gr64bit = MSGF %2:gr64bit(tied-def 0), $noreg, 4, $noreg ::
+---
+name: fun3
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r3l, $r5l
+
+ %0:gr32bit = COPY $r5l
+ %1:gr32bit = COPY $r3l
+ %2:gr64bit = LGFR %1
+ %3:gr64bit = LGFR %0
+ %3:gr64bit = MSGF %3, $noreg, 4, $noreg :: (load (s32))
+ %2:gr64bit = MSGF %2, $noreg, 4, $noreg :: (load (s32))
+ %4:gr64bit = OGRK %2, %3, implicit-def dead $cc
+ %5:addr64bit = SLLG %4, $noreg, 3
+...
+
+# Schedule the VL64 low, above the first LG. It is loading a (live out) vr64,
+# which is prioritized, so even if %1 isn't live, that's ok as that is a GPR
+# register. This heuristic is guarded to not increase the scheduled latency,
+# which makes it wait until the first LG is scheduled.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun4:%bb.0
+# CHECK: Region is not tiny.
+# CHECK: Live out at bottom: %2, %3, %4
+#
+# CHECK: Queue BotQ.A: 2 3 0
+# CHECK: Scheduling SU(2) %2:vr64bit = VL64 %1:addr64bit, 0, $noreg ::
+# CHECK: Live regs was: %0, %2, %3
+#
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): %0:addr64bit = COPY $r3d
+# CHECK-NEXT: SU(1): %1:addr64bit = COPY $r2d
+# CHECK-NEXT: SU(3): %3:addr64bit = LG %1:addr64bit, 0, $noreg ::
+# CHECK-NEXT: SU(2): %2:vr64bit = VL64 %1:addr64bit, 0, $noreg ::
+# CHECK-NEXT: SU(4): %4:addr64bit = LG %0:addr64bit, 0, $noreg ::
+---
+name: fun4
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r2d, $r3d
+
+ %0:addr64bit = COPY $r3d
+ %1:addr64bit = COPY $r2d
+ %2:vr64bit = VL64 %1, 0, $noreg :: (load (s64) from %ir.Arg)
+ %3:addr64bit = LG %1, 0, $noreg :: (load (s64) from `ptr null`)
+ %4:addr64bit = LG %0, 0, $noreg :: (load (s64) from %ir.Arg)
+ %5:gr64bit = COPY undef %11:gr64bit ; Make region non-tiny
+ %6:gr64bit = COPY undef %11:gr64bit ;
+ %7:gr64bit = COPY undef %11:gr64bit ;
+ %8:gr64bit = COPY undef %11:gr64bit ;
+ %9:gr64bit = COPY undef %11:gr64bit ;
+ %10:gr64bit = COPY undef %11:gr64bit ;
+ CallBASR implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc
+ STG %3, $noreg, 0, $noreg
+ VST64 %2, $noreg, 0, $noreg
+ STG %4, $noreg, 0, $noreg
+ Return
+...
+
+# Same as previous, but the VLR64 has a vr64bit (prioritized) use that will
+# become live, so it is not pulled down.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun5:%bb.0
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun5:%bb.0
+# CHECK: Region is not tiny.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): %0:addr64bit = COPY $r3d
+# CHECK-NEXT: SU(1): %1:addr64bit = COPY $r2d
+# CHECK-NEXT: SU(2): %2:vr64bit = COPY $f0d
+# CHECK-NEXT: SU(3): %3:vr64bit = VLR64 %2:vr64bit
+# CHECK-NEXT: SU(4): %4:addr64bit = LG %1:addr64bit, 0, $noreg ::
+# CHECK-NEXT: SU(5): %5:addr64bit = LG %0:addr64bit, 0, $noreg ::
+---
+name: fun5
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r2d, $r3d, $f0d
+
+ %0:addr64bit = COPY $r3d
+ %1:addr64bit = COPY $r2d
+ %2:vr64bit = COPY $f0d
+ %3:vr64bit = VLR64 %2
+ %4:addr64bit = LG %1, 0, $noreg :: (load (s64) from `ptr null`)
+ %5:addr64bit = LG %0, 0, $noreg :: (load (s64) from %ir.Arg)
+ %6:gr64bit = COPY undef %7:gr64bit ; Make region non-tiny
+ %7:gr64bit = COPY undef %7:gr64bit ;
+ %8:gr64bit = COPY undef %7:gr64bit ;
+ %9:gr64bit = COPY undef %7:gr64bit ;
+ %10:gr64bit = COPY undef %7:gr64bit ;
+ %11:gr64bit = COPY undef %7:gr64bit ;
+ %12:gr64bit = COPY undef %7:gr64bit ;
+ CallBASR implicit-def dead $r14d, implicit-def dead $cc, implicit $fpc
+ STG %4, $noreg, 0, $noreg
+ VST64 %3, $noreg, 0, $noreg
+ STG %5, $noreg, 0, $noreg
+ Return
+...
+
+# The VL64 increases scheduled latency, but there are enough remaining instructions
+# that will go above it, so it is still pulled down close to its user.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun6:%bb.0
+# CHECK: Region is not tiny.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK: SU(0): %0:vr64bit = VL64 $noreg, 0, $noreg
+# CHECK-NEXT: SU(32): VST64 %0:vr64bit, $noreg, 0, $noreg
+---
+name: fun6
+tracksRegLiveness: true
+body: |
+ bb.0:
+
+ %2:vr64bit = VL64 $noreg, 0, $noreg
+ %3:gr64bit = COPY undef %11:gr64bit ;
+ %4:gr64bit = COPY undef %11:gr64bit ;
+ %5:gr64bit = COPY undef %11:gr64bit ;
+ %6:gr64bit = COPY undef %11:gr64bit ;
+ %7:gr64bit = COPY undef %11:gr64bit ;
+ %8:gr64bit = COPY undef %11:gr64bit ;
+ %9:gr64bit = COPY undef %11:gr64bit ;
+ %11:gr64bit = COPY undef %11:gr64bit ;
+ %12:gr64bit = COPY undef %11:gr64bit ;
+ %13:gr64bit = COPY undef %11:gr64bit ;
+ %14:gr64bit = COPY undef %11:gr64bit ;
+ %15:gr64bit = COPY undef %11:gr64bit ;
+ %16:gr64bit = COPY undef %11:gr64bit ;
+ %17:gr64bit = COPY undef %11:gr64bit ;
+ %18:gr64bit = COPY undef %11:gr64bit ;
+ %19:gr64bit = COPY undef %11:gr64bit ;
+ %20:gr64bit = COPY undef %11:gr64bit ;
+ %21:gr64bit = COPY undef %11:gr64bit ;
+ %22:gr64bit = COPY undef %11:gr64bit ;
+ %23:gr64bit = COPY undef %11:gr64bit ;
+ %24:gr64bit = COPY undef %11:gr64bit ;
+ %25:gr64bit = COPY undef %11:gr64bit ;
+ %26:gr64bit = COPY undef %11:gr64bit ;
+ %27:gr64bit = COPY undef %11:gr64bit ;
+ %28:gr64bit = COPY undef %11:gr64bit ;
+ %29:gr64bit = COPY undef %11:gr64bit ;
+ %30:gr64bit = COPY undef %11:gr64bit ;
+ %31:gr64bit = COPY undef %11:gr64bit ;
+ %32:gr64bit = COPY undef %11:gr64bit ;
+ %33:gr64bit = COPY undef %11:gr64bit ;
+ %34:gr64bit = COPY undef %11:gr64bit ;
+ VST64 %2, $noreg, 0, $noreg
+ Return
+...
+
+# Same as previous, but there are not enough remaining instructions
+# that will go above it, so it stays high as it increases the scheduled latency.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun7:%bb.0
+# CHECK: Region is not tiny.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): %0:vr64bit = VL64 $noreg, 0, $noreg
+# CHECK-NEXT: SU(1): dead %1:gr64bit = COPY undef %2:gr64bit
+# CHECK-NEXT: SU(2): dead %3:gr64bit = COPY undef %2:gr64bit
+# CHECK-NEXT: SU(3): dead %4:gr64bit = COPY undef %2:gr64bit
+# CHECK-NEXT: SU(4): dead %5:gr64bit = COPY undef %2:gr64bit
+# CHECK-NEXT: SU(5): dead %6:gr64bit = COPY undef %2:gr64bit
+# CHECK-NEXT: SU(6): dead %7:gr64bit = COPY undef %2:gr64bit
+# CHECK-NEXT: SU(7): dead %8:gr64bit = COPY undef %2:gr64bit
+# CHECK-NEXT: SU(8): dead %2:gr64bit = COPY undef %2:gr64bit
+# CHECK-NEXT: SU(9): dead %9:gr64bit = COPY undef %2:gr64bit
+# CHECK-NEXT: SU(10): VST64 %0:vr64bit, $noreg, 0, $noreg
+---
+name: fun7
+tracksRegLiveness: true
+body: |
+ bb.0:
+
+ %2:vr64bit = VL64 $noreg, 0, $noreg
+ %3:gr64bit = COPY undef %11:gr64bit ;
+ %4:gr64bit = COPY undef %11:gr64bit ;
+ %5:gr64bit = COPY undef %11:gr64bit ;
+ %6:gr64bit = COPY undef %11:gr64bit ;
+ %7:gr64bit = COPY undef %11:gr64bit ;
+ %8:gr64bit = COPY undef %11:gr64bit ;
+ %9:gr64bit = COPY undef %11:gr64bit ;
+ %11:gr64bit = COPY undef %11:gr64bit ;
+ %12:gr64bit = COPY undef %11:gr64bit ;
+ VST64 %2, $noreg, 0, $noreg
+ Return
+...
+
+# Same as fun6 but the VL64 is using %0, making it become live. It is therefore
+# not pulled down even though there are still many instructions remaining.
+# Scheduling 24 of the COPYs means 4 cycles have been scheduled, at which point
+# the VL64 can be scheduled without increasing the scheduled latency.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun8:%bb.0
+# CHECK: Region is not tiny.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): %0:addr64bit = COPY $r2d
+# CHECK-NEXT: SU(2): dead %2:gr64bit = COPY undef %3:gr64bit
+# CHECK-NEXT: SU(3): dead %4:gr64bit = COPY undef %3:gr64bit
+# CHECK-NEXT: SU(4): dead %5:gr64bit = COPY undef %3:gr64bit
+# CHECK-NEXT: SU(5): dead %6:gr64bit = COPY undef %3:gr64bit
+# CHECK-NEXT: SU(6): dead %7:gr64bit = COPY undef %3:gr64bit
+# CHECK-NEXT: SU(7): dead %8:gr64bit = COPY undef %3:gr64bit
+# CHECK-NEXT: SU(8): dead %9:gr64bit = COPY undef %3:gr64bit
+# CHECK-NEXT: SU(9): dead %3:gr64bit = COPY undef %3:gr64bit
+# CHECK-NEXT: SU(1): %1:vr64bit = VL64 %0:addr64bit, 0, $noreg
+---
+name: fun8
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r2d
+
+ %1:addr64bit = COPY $r2d
+ %2:vr64bit = VL64 %1, 0, $noreg
+ %3:gr64bit = COPY undef %11:gr64bit ;
+ %4:gr64bit = COPY undef %11:gr64bit ;
+ %5:gr64bit = COPY undef %11:gr64bit ;
+ %6:gr64bit = COPY undef %11:gr64bit ;
+ %7:gr64bit = COPY undef %11:gr64bit ;
+ %8:gr64bit = COPY undef %11:gr64bit ;
+ %9:gr64bit = COPY undef %11:gr64bit ;
+ %11:gr64bit = COPY undef %11:gr64bit ;
+ %12:gr64bit = COPY undef %11:gr64bit ;
+ %13:gr64bit = COPY undef %11:gr64bit ;
+ %14:gr64bit = COPY undef %11:gr64bit ;
+ %15:gr64bit = COPY undef %11:gr64bit ;
+ %16:gr64bit = COPY undef %11:gr64bit ;
+ %17:gr64bit = COPY undef %11:gr64bit ;
+ %18:gr64bit = COPY undef %11:gr64bit ;
+ %19:gr64bit = COPY undef %11:gr64bit ;
+ %20:gr64bit = COPY undef %11:gr64bit ;
+ %21:gr64bit = COPY undef %11:gr64bit ;
+ %22:gr64bit = COPY undef %11:gr64bit ;
+ %23:gr64bit = COPY undef %11:gr64bit ;
+ %24:gr64bit = COPY undef %11:gr64bit ;
+ %25:gr64bit = COPY undef %11:gr64bit ;
+ %26:gr64bit = COPY undef %11:gr64bit ;
+ %27:gr64bit = COPY undef %11:gr64bit ;
+ %28:gr64bit = COPY undef %11:gr64bit ;
+ %29:gr64bit = COPY undef %11:gr64bit ;
+ %30:gr64bit = COPY undef %11:gr64bit ;
+ %31:gr64bit = COPY undef %11:gr64bit ;
+ %32:gr64bit = COPY undef %11:gr64bit ;
+ %33:gr64bit = COPY undef %11:gr64bit ;
+ %34:gr64bit = COPY undef %11:gr64bit ;
+ VST64 %2, $noreg, 0, $noreg
+ Return
+...
diff --git a/llvm/test/CodeGen/SystemZ/misched-prera-manystores-01.ll b/llvm/test/CodeGen/SystemZ/misched-prera-manystores-01.ll
new file mode 100644
index 0000000000000..39272ae5bb68e
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/misched-prera-manystores-01.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 -debug-only=machine-scheduler \
+; RUN: 2>&1 | FileCheck %s --check-prefix=DBG
+; REQUIRES: asserts
+
+; The many stores should not all end up at the bottom, or spilling would result.
+; DBG: ********** MI Scheduling **********
+; DBG-NEXT: f1:%bb.0
+; DBG: *** Final schedule for %bb.0 ***
+; DBG-NEXT: SU(0): %1:addr64bit = COPY $r3d
+; DBG-NEXT: SU(1): %0:addr64bit = COPY $r2d
+; DBG-NEXT: SU(2): %33:fp32bit = LE %0:addr64bit, 60, $noreg
+; DBG-NEXT: SU(33): %33:fp32bit = nofpexcept AEBR %33:fp32bit(tied-def 0), %33:fp32bit,
+; DBG-NEXT: SU(34): STE %33:fp32bit, %1:addr64bit, 60, $noreg
+; DBG-NEXT: SU(3): %32:fp32bit = LE %0:addr64bit, 56, $noreg
+; DBG-NEXT: SU(32): %32:fp32bit = nofpexcept AEBR %32:fp32bit(tied-def 0), %32:fp32bit
+; DBG-NEXT: SU(35): STE %32:fp32bit, %1:addr64bit, 56, $noreg
+; ...
+; DBG: SU(17): %18:fp32bit = LE %0:addr64bit, 0, $noreg
+; DBG-NEXT: SU(18): %18:fp32bit = nofpexcept AEBR %18:fp32bit(tied-def 0), %18:fp32bit,
+; DBG-NEXT: SU(49): STE %18:fp32bit, %1:addr64bit, 0, $noreg
+
+define void @f1(ptr noalias %src1, ptr noalias %dest) {
+; CHECK-LABEL: f1:
+; CHECK-NOT: %r15
+; CHECK: br %r14
+ %val = load <16 x float>, ptr %src1
+ %add = fadd <16 x float> %val, %val
+ store <16 x float> %add, ptr %dest
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/misched-prera-manystores-02.mir b/llvm/test/CodeGen/SystemZ/misched-prera-manystores-02.mir
new file mode 100644
index 0000000000000..98beca886f0e3
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/misched-prera-manystores-02.mir
@@ -0,0 +1,200 @@
+# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z196 -verify-machineinstrs \
+# RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler 2>&1 \
+# RUN: | FileCheck %s
+# REQUIRES: asserts
+
+--- |
+ define void @fun0(ptr noalias %src1, ptr noalias %dest) { ret void }
+ define void @fun1(ptr noalias %src1, ptr noalias %dest) { ret void }
+ define void @fun2(ptr noalias %src1, ptr noalias %dest) { ret void }
+...
+
+# Test that stores in a group are not scheduled high if the register is live
+# already. After the first 'STE %7', the second one will be scheduled above
+# it.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun0:%bb.0
+# CHECK: Region is not tiny.
+# CHECK-NEXT: Latency scheduling enabled.
+# CHECK: Has StoresGroup of 9 stores.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): %0:addr64bit = COPY $r3d
+# CHECK-NEXT: SU(1): %1:addr64bit = COPY $r2d
+# CHECK-NEXT: SU(2): %2:fp32bit = LE %1:addr64bit, 28, $noreg ::
+# CHECK-NEXT: SU(17): %2:fp32bit = nofpexcept AEBR %2:fp32bit(tied-def 0), %2:fp32bit,
+# CHECK-NEXT: SU(18): STE %2:fp32bit, %0:addr64bit, 28, $noreg ::
+# CHECK-NEXT: SU(3): %3:fp32bit = LE %1:addr64bit, 24, $noreg ::
+# CHECK-NEXT: SU(16): %3:fp32bit = nofpexcept AEBR %3:fp32bit(tied-def 0), %3:fp32bit,
+# CHECK-NEXT: SU(19): STE %3:fp32bit, %0:addr64bit, 24, $noreg ::
+# CHECK-NEXT: SU(4): %4:fp32bit = LE %1:addr64bit, 20, $noreg ::
+# CHECK-NEXT: SU(15): %4:fp32bit = nofpexcept AEBR %4:fp32bit(tied-def 0), %4:fp32bit,
+# CHECK-NEXT: SU(20): STE %4:fp32bit, %0:addr64bit, 20, $noreg ::
+# CHECK-NEXT: SU(5): %5:fp32bit = LE %1:addr64bit, 16, $noreg ::
+# CHECK-NEXT: SU(14): %5:fp32bit = nofpexcept AEBR %5:fp32bit(tied-def 0), %5:fp32bit,
+# CHECK-NEXT: SU(21): STE %5:fp32bit, %0:addr64bit, 16, $noreg ::
+# CHECK-NEXT: SU(6): %6:fp32bit = LE %1:addr64bit, 12, $noreg ::
+# CHECK-NEXT: SU(13): %6:fp32bit = nofpexcept AEBR %6:fp32bit(tied-def 0), %6:fp32bit,
+# CHECK-NEXT: SU(22): STE %6:fp32bit, %0:addr64bit, 12, $noreg ::
+# CHECK-NEXT: SU(8): %8:fp32bit = LE %1:addr64bit, 4, $noreg ::
+# CHECK-NEXT: SU(11): %8:fp32bit = nofpexcept AEBR %8:fp32bit(tied-def 0), %8:fp32bit,
+# CHECK-NEXT: SU(24): STE %8:fp32bit, %0:addr64bit, 4, $noreg ::
+# CHECK-NEXT: SU(9): %9:fp32bit = LE %1:addr64bit, 0, $noreg ::
+# CHECK-NEXT: SU(10): %9:fp32bit = nofpexcept AEBR %9:fp32bit(tied-def 0), %9:fp32bit,
+# CHECK-NEXT: SU(25): STE %9:fp32bit, %0:addr64bit, 0, $noreg ::
+# CHECK-NEXT: SU(7): %7:fp32bit = LE %1:addr64bit, 8, $noreg ::
+# CHECK-NEXT: SU(12): %7:fp32bit = nofpexcept AEBR %7:fp32bit(tied-def 0), %7:fp32bit,
+# CHECK-NEXT: SU(23): STE %7:fp32bit, %0:addr64bit, 8, $noreg ::
+# CHECK-NEXT: SU(26): STE %7:fp32bit, %0:addr64bit, 0, $noreg ::
+---
+name: fun0
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $r2d, $r3d
+
+ %0:addr64bit = COPY $r3d
+ %1:addr64bit = COPY $r2d
+ %2:fp32bit = LE %1, 28, $noreg :: (load (s32) from %ir.src1 + 28, basealign 64)
+ %3:fp32bit = LE %1, 24, $noreg :: (load (s32) from %ir.src1 + 24, align 8, basealign 64)
+ %4:fp32bit = LE %1, 20, $noreg :: (load (s32) from %ir.src1 + 20, basealign 64)
+ %5:fp32bit = LE %1, 16, $noreg :: (load (s32) from %ir.src1 + 16, align 16, basealign 64)
+ %6:fp32bit = LE %1, 12, $noreg :: (load (s32) from %ir.src1 + 12, basealign 64)
+ %7:fp32bit = LE %1, 8, $noreg :: (load (s32) from %ir.src1 + 8, align 8, basealign 64)
+ %8:fp32bit = LE %1, 4, $noreg :: (load (s32) from %ir.src1 + 4, basealign 64)
+ %9:fp32bit = LE %1, 0, $noreg :: (load (s32) from %ir.src1, align 64)
+ %9:fp32bit = nofpexcept AEBR %9, %9, implicit-def dead $cc, implicit $fpc
+ %8:fp32bit = nofpexcept AEBR %8, %8, implicit-def dead $cc, implicit $fpc
+ %7:fp32bit = nofpexcept AEBR %7, %7, implicit-def dead $cc, implicit $fpc
+ %6:fp32bit = nofpexcept AEBR %6, %6, implicit-def dead $cc, implicit $fpc
+ %5:fp32bit = nofpexcept AEBR %5, %5, implicit-def dead $cc, implicit $fpc
+ %4:fp32bit = nofpexcept AEBR %4, %4, implicit-def dead $cc, implicit $fpc
+ %3:fp32bit = nofpexcept AEBR %3, %3, implicit-def dead $cc, implicit $fpc
+ %2:fp32bit = nofpexcept AEBR %2, %2, implicit-def dead $cc, implicit $fpc
+ STE %2, %0, 28, $noreg :: (store (s32) into %ir.dest + 28, basealign 64)
+ STE %3, %0, 24, $noreg :: (store (s32) into %ir.dest + 24, align 8, basealign 64)
+ STE %4, %0, 20, $noreg :: (store (s32) into %ir.dest + 20, basealign 64)
+ STE %5, %0, 16, $noreg :: (store (s32) into %ir.dest + 16, align 16, basealign 64)
+ STE %6, %0, 12, $noreg :: (store (s32) into %ir.dest + 12, basealign 64)
+ STE %7, %0, 8, $noreg :: (store (s32) into %ir.dest + 8, align 8, basealign 64)
+ STE %8, %0, 4, $noreg :: (store (s32) into %ir.dest + 4, basealign 64)
+ STE %9, %0, 0, $noreg :: (store (s32) into %ir.dest, align 64)
+ STE %7, %0, 0, $noreg :: (store (s32) into %ir.dest + 8, align 8, basealign 64)
+ Return
+...
+
+# Test that stores in a group are scheduled regardless of making an address
+# register live.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun1:%bb.0
+# CHECK: Region is not tiny.
+# CHECK-NEXT: Latency scheduling enabled.
+# CHECK: Has StoresGroup of 8 stores.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): %0:addr64bit = COPY $r3d
+# CHECK-NEXT: SU(1): %1:addr64bit = COPY $r2d
+# CHECK-NEXT: SU(3): %3:fp32bit = LE %1:addr64bit, 28, $noreg ::
+# CHECK-NEXT: SU(18): %3:fp32bit = nofpexcept AEBR %3:fp32bit(tied-def 0), %3:fp32bit,
+# CHECK-NEXT: SU(19): STE %3:fp32bit, %0:addr64bit, 28, $noreg ::
+# CHECK-NEXT: SU(4): %4:fp32bit = LE %1:addr64bit, 24, $noreg ::
+# CHECK-NEXT: SU(17): %4:fp32bit = nofpexcept AEBR %4:fp32bit(tied-def 0), %4:fp32bit,
+# CHECK-NEXT: SU(20): STE %4:fp32bit, %0:addr64bit, 24, $noreg ::
+# CHECK-NEXT: SU(5): %5:fp32bit = LE %1:addr64bit, 20, $noreg ::
+# CHECK-NEXT: SU(16): %5:fp32bit = nofpexcept AEBR %5:fp32bit(tied-def 0), %5:fp32bit,
+# CHECK-NEXT: SU(21): STE %5:fp32bit, %0:addr64bit, 20, $noreg ::
+# CHECK-NEXT: SU(6): %6:fp32bit = LE %1:addr64bit, 16, $noreg ::
+# CHECK-NEXT: SU(15): %6:fp32bit = nofpexcept AEBR %6:fp32bit(tied-def 0), %6:fp32bit,
+# CHECK-NEXT: SU(22): STE %6:fp32bit, %0:addr64bit, 16, $noreg ::
+# CHECK-NEXT: SU(7): %7:fp32bit = LE %1:addr64bit, 12, $noreg ::
+# CHECK-NEXT: SU(14): %7:fp32bit = nofpexcept AEBR %7:fp32bit(tied-def 0), %7:fp32bit,
+# CHECK-NEXT: SU(23): STE %7:fp32bit, %0:addr64bit, 12, $noreg ::
+# CHECK-NEXT: SU(8): %8:fp32bit = LE %1:addr64bit, 8, $noreg ::
+# CHECK-NEXT: SU(13): %8:fp32bit = nofpexcept AEBR %8:fp32bit(tied-def 0), %8:fp32bit,
+# CHECK-NEXT: SU(2): %OffsReg:addr64bit = LGHI 8
+# CHECK-NEXT: SU(24): STE %8:fp32bit, %0:addr64bit, 0, %OffsReg:addr64bit ::
+# CHECK-NEXT: SU(9): %9:fp32bit = LE %1:addr64bit, 4, $noreg ::
+# CHECK-NEXT: SU(12): %9:fp32bit = nofpexcept AEBR %9:fp32bit(tied-def 0), %9:fp32bit,
+# CHECK-NEXT: SU(25): STE %9:fp32bit, %0:addr64bit, 4, $noreg ::
+# CHECK-NEXT: SU(10): %10:fp32bit = LE %1:addr64bit, 0, $noreg ::
+# CHECK-NEXT: SU(11): %10:fp32bit = nofpexcept AEBR %10:fp32bit(tied-def 0), %10:fp32bit,
+# CHECK-NEXT: SU(26): STE %10:fp32bit, %0:addr64bit, 0, $noreg ::
+---
+name: fun1
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $r2d, $r3d
+
+ %0:addr64bit = COPY $r3d
+ %1:addr64bit = COPY $r2d
+ %OffsReg:addr64bit = LGHI 8
+ %3:fp32bit = LE %1, 28, $noreg :: (load (s32) from %ir.src1 + 28, basealign 64)
+ %4:fp32bit = LE %1, 24, $noreg :: (load (s32) from %ir.src1 + 24, align 8, basealign 64)
+ %5:fp32bit = LE %1, 20, $noreg :: (load (s32) from %ir.src1 + 20, basealign 64)
+ %6:fp32bit = LE %1, 16, $noreg :: (load (s32) from %ir.src1 + 16, align 16, basealign 64)
+ %7:fp32bit = LE %1, 12, $noreg :: (load (s32) from %ir.src1 + 12, basealign 64)
+ %8:fp32bit = LE %1, 8, $noreg :: (load (s32) from %ir.src1 + 8, align 8, basealign 64)
+ %9:fp32bit = LE %1, 4, $noreg :: (load (s32) from %ir.src1 + 4, basealign 64)
+ %10:fp32bit = LE %1, 0, $noreg :: (load (s32) from %ir.src1, align 64)
+ %10:fp32bit = nofpexcept AEBR %10, %10, implicit-def dead $cc, implicit $fpc
+ %9:fp32bit = nofpexcept AEBR %9, %9, implicit-def dead $cc, implicit $fpc
+ %8:fp32bit = nofpexcept AEBR %8, %8, implicit-def dead $cc, implicit $fpc
+ %7:fp32bit = nofpexcept AEBR %7, %7, implicit-def dead $cc, implicit $fpc
+ %6:fp32bit = nofpexcept AEBR %6, %6, implicit-def dead $cc, implicit $fpc
+ %5:fp32bit = nofpexcept AEBR %5, %5, implicit-def dead $cc, implicit $fpc
+ %4:fp32bit = nofpexcept AEBR %4, %4, implicit-def dead $cc, implicit $fpc
+ %3:fp32bit = nofpexcept AEBR %3, %3, implicit-def dead $cc, implicit $fpc
+ STE %3, %0, 28, $noreg :: (store (s32) into %ir.dest + 28, basealign 64)
+ STE %4, %0, 24, $noreg :: (store (s32) into %ir.dest + 24, align 8, basealign 64)
+ STE %5, %0, 20, $noreg :: (store (s32) into %ir.dest + 20, basealign 64)
+ STE %6, %0, 16, $noreg :: (store (s32) into %ir.dest + 16, align 16, basealign 64)
+ STE %7, %0, 12, $noreg :: (store (s32) into %ir.dest + 12, basealign 64)
+ STE %8, %0, 0, %OffsReg :: (store (s32) into %ir.dest + 8, align 8, basealign 64)
+ STE %9, %0, 4, $noreg :: (store (s32) into %ir.dest + 4, basealign 64)
+ STE %10, %0, 0, $noreg :: (store (s32) into %ir.dest, align 64)
+ Return
+...
+
+# Test that stores are not rescheduled in smaller groups.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun2:%bb.0
+# CHECK: Region is not tiny.
+# CHECK-NEXT: Latency scheduling enabled.
+# CHECK: No StoresGroup.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): %0:addr64bit = COPY $r3d
+# CHECK-NEXT: SU(1): %1:addr64bit = COPY $r2d
+# CHECK-NEXT: SU(2): %2:fp32bit = LE %1:addr64bit, 12, $noreg ::
+# CHECK-NEXT: SU(3): %3:fp32bit = LE %1:addr64bit, 8, $noreg ::
+# CHECK-NEXT: SU(4): %4:fp32bit = LE %1:addr64bit, 4, $noreg ::
+# CHECK-NEXT: SU(5): %5:fp32bit = LE %1:addr64bit, 0, $noreg ::
+# CHECK-NEXT: SU(6): %5:fp32bit = nofpexcept AEBR %5:fp32bit(tied-def 0), %5:fp32bit,
+# CHECK-NEXT: SU(7): %4:fp32bit = nofpexcept AEBR %4:fp32bit(tied-def 0), %4:fp32bit,
+# CHECK-NEXT: SU(8): %3:fp32bit = nofpexcept AEBR %3:fp32bit(tied-def 0), %3:fp32bit,
+# CHECK-NEXT: SU(9): %2:fp32bit = nofpexcept AEBR %2:fp32bit(tied-def 0), %2:fp32bit,
+# CHECK-NEXT: SU(10): STE %2:fp32bit, %0:addr64bit, 12, $noreg ::
+# CHECK-NEXT: SU(11): STE %3:fp32bit, %0:addr64bit, 8, $noreg ::
+# CHECK-NEXT: SU(12): STE %4:fp32bit, %0:addr64bit, 4, $noreg ::
+# CHECK-NEXT: SU(13): STE %5:fp32bit, %0:addr64bit, 0, $noreg ::
+---
+name: fun2
+tracksRegLiveness: true
+body: |
+ bb.0 (%ir-block.0):
+ liveins: $r2d, $r3d
+
+ %0:addr64bit = COPY $r3d
+ %1:addr64bit = COPY $r2d
+ %2:fp32bit = LE %1, 12, $noreg :: (load (s32) from %ir.src1 + 12, basealign 64)
+ %3:fp32bit = LE %1, 8, $noreg :: (load (s32) from %ir.src1 + 8, align 8, basealign 64)
+ %4:fp32bit = LE %1, 4, $noreg :: (load (s32) from %ir.src1 + 4, basealign 64)
+ %5:fp32bit = LE %1, 0, $noreg :: (load (s32) from %ir.src1, align 64)
+ %5:fp32bit = nofpexcept AEBR %5, %5, implicit-def dead $cc, implicit $fpc
+ %4:fp32bit = nofpexcept AEBR %4, %4, implicit-def dead $cc, implicit $fpc
+ %3:fp32bit = nofpexcept AEBR %3, %3, implicit-def dead $cc, implicit $fpc
+ %2:fp32bit = nofpexcept AEBR %2, %2, implicit-def dead $cc, implicit $fpc
+ STE %2, %0, 12, $noreg :: (store (s32) into %ir.dest + 12, basealign 64)
+ STE %3, %0, 8, $noreg :: (store (s32) into %ir.dest + 8, align 8, basealign 64)
+ STE %4, %0, 4, $noreg :: (store (s32) into %ir.dest + 4, basealign 64)
+ STE %5, %0, 0, $noreg :: (store (s32) into %ir.dest, align 64)
+ Return
+...
diff --git a/llvm/test/CodeGen/SystemZ/misched-prera-manystores-03.mir b/llvm/test/CodeGen/SystemZ/misched-prera-manystores-03.mir
new file mode 100644
index 0000000000000..839c2c1198396
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/misched-prera-manystores-03.mir
@@ -0,0 +1,154 @@
+# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \
+# RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler 2>&1 \
+# RUN: | FileCheck %s
+
+--- |
+ define void @fun0(ptr noalias %Arg0, ptr noalias %Arg2, ptr noalias %Arg4) {
+ ret void
+ }
+ define void @fun1() { ret void }
+ define void @fun2(ptr %Arg0, ptr %Arg2, ptr noalias %Arg4) { ret void }
+...
+
+# Test that a group of stores is only rescheduled if they all have the same
+# opcode. There are 8 stores of the same depth but of different opcodes, and
+# therefore they are not considered as a group.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun0:%bb.0
+# CHECK: Region is not tiny.
+# CHECK-NEXT: Latency scheduling disabled.
+# CHECK: No StoresGroup.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK: SU(10): STG %2:gr64bit, %0:addr64bit, 0, $noreg ::
+# CHECK-NEXT: SU(11): STD %3:fp64bit, %1:addr64bit, 0, $noreg ::
+# CHECK-NEXT: SU(12): STG %4:gr64bit, %0:addr64bit, 0, $noreg ::
+# CHECK-NEXT: SU(13): STD %5:fp64bit, %1:addr64bit, 0, $noreg ::
+# CHECK-NEXT: SU(14): STG %6:gr64bit, %0:addr64bit, 0, $noreg ::
+# CHECK-NEXT: SU(15): STD %7:fp64bit, %1:addr64bit, 0, $noreg ::
+# CHECK-NEXT: SU(16): STG %8:gr64bit, %0:addr64bit, 0, $noreg ::
+# CHECK-NEXT: SU(17): STD %9:fp64bit, %1:addr64bit, 0, $noreg ::
+---
+name: fun0
+tracksRegLiveness: true
+fixedStack:
+ - { id: 0, type: default, offset: 0, size: 8, alignment: 8, stack-id: default }
+body: |
+ bb.0:
+ liveins: $r2d, $r3d
+
+ %1:addr64bit = COPY $r3d
+ %0:addr64bit = COPY $r2d
+ %2:gr64bit = LG $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ %3:fp64bit = LD $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ %4:gr64bit = LG $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ %5:fp64bit = LD $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ %6:gr64bit = LG $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ %7:fp64bit = LD $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ %8:gr64bit = LG $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ %9:fp64bit = LD $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ STG %2, %1, 0, $noreg :: (store (s64) into %ir.Arg2)
+ STD %3, %0, 0, $noreg :: (store (s64) into %ir.Arg0)
+ STG %4, %1, 0, $noreg :: (store (s64) into %ir.Arg2)
+ STD %5, %0, 0, $noreg :: (store (s64) into %ir.Arg0)
+ STG %6, %1, 0, $noreg :: (store (s64) into %ir.Arg2)
+ STD %7, %0, 0, $noreg :: (store (s64) into %ir.Arg0)
+ STG %8, %1, 0, $noreg :: (store (s64) into %ir.Arg2)
+ STD %9, %0, 0, $noreg :: (store (s64) into %ir.Arg0)
+ Return
+...
+
+# Test that stores that are not at the bottom (depth-wise) are not rescheduled.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun1:%bb.0
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun1:%bb.1
+# CHECK: Region is not tiny.
+# CHECK-NEXT: Latency scheduling enabled.
+# CHECK: No StoresGroup.
+# CHECK: *** Final schedule for %bb.1 ***
+# CHECK: SU(4): VST32 %2:fp32bit, %3:addr64bit, 0, $noreg
+# CHECK-NEXT: SU(5): VST32 %2:fp32bit, $noreg, 0, $noreg
+# CHECK-NEXT: SU(6): VST32 %2:fp32bit, %1:addr64bit, 0, $noreg
+# CHECK-NEXT: SU(7): VST32 %4:vr32bit, %0:addr64bit, 0, $noreg
+# CHECK-NEXT: SU(8): VST32 %2:fp32bit, %3:addr64bit, 0, $noreg
+# CHECK-NEXT: SU(9): VST32 %2:fp32bit, %1:addr64bit, 0, $noreg
+# CHECK-NEXT: SU(10): VST32 %2:fp32bit, %3:addr64bit, 0, $noreg
+# CHECK-NEXT: SU(11): VST32 %2:fp32bit, %1:addr64bit, 0, $noreg
+# CHECK-NEXT: SU(12): $r2l = COPY %7:grx32bit
+---
+name: fun1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $r2d, $f0s, $r3d, $r4d
+
+ %0:addr64bit = COPY $r4d
+ %1:addr64bit = COPY $r3d
+ %2:fp32bit = COPY $f0s
+ %3:addr64bit = COPY $r2d
+ %4:vr32bit = VL32 %1, 4, $noreg
+
+ bb.1:
+ %5:fp32bit = WFLPSB %2
+ dead %6:fp32bit = nofpexcept LTEBR %5, implicit-def $cc, implicit $fpc
+ %7:grx32bit = LHIMux 0
+ %7:grx32bit = LOCHIMux %7, 1, 15, 2, implicit killed $cc
+ VST32 %2, %3, 0, $noreg
+ VST32 %2, $noreg, 0, $noreg
+ VST32 %2, %1, 0, $noreg
+ VST32 %4, %0, 0, $noreg
+ VST32 %2, %3, 0, $noreg
+ VST32 %2, %1, 0, $noreg
+ VST32 %2, %3, 0, $noreg
+ VST32 %2, %1, 0, $noreg
+ $r2l = COPY %7
+ Return implicit $r2l
+...
+
+# Test that a group of stores is only rescheduled if they all have the same
+# depth. There are 8 stores, but of different depths, and therefore they are
+# not considered as a group.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun2:%bb.0
+# CHECK: Region is not tiny.
+# CHECK-NEXT: Latency scheduling disabled.
+# CHECK: No StoresGroup.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK: SU(10): STG %2:gr64bit, %0:addr64bit, 0, $noreg :: (store (s64) into %ir.Arg2)
+# CHECK-NEXT: SU(11): STG %3:gr64bit, %1:addr64bit, 0, $noreg :: (store (s64) into %ir.Arg0)
+# CHECK-NEXT: SU(12): STG %4:gr64bit, %0:addr64bit, 0, $noreg :: (store (s64) into %ir.Arg2)
+# CHECK-NEXT: SU(13): STG %5:gr64bit, %1:addr64bit, 0, $noreg :: (store (s64) into %ir.Arg0)
+# CHECK-NEXT: SU(14): STG %6:gr64bit, %0:addr64bit, 0, $noreg :: (store (s64) into %ir.Arg2)
+# CHECK-NEXT: SU(15): STG %7:gr64bit, %1:addr64bit, 0, $noreg :: (store (s64) into %ir.Arg0)
+# CHECK-NEXT: SU(16): STG %8:gr64bit, %0:addr64bit, 0, $noreg :: (store (s64) into %ir.Arg2)
+# CHECK-NEXT: SU(17): STG %9:gr64bit, %1:addr64bit, 0, $noreg :: (store (s64) into %ir.Arg0)
+---
+name: fun2
+tracksRegLiveness: true
+fixedStack:
+ - { id: 0, type: default, offset: 0, size: 8, alignment: 8, stack-id: default }
+body: |
+ bb.0:
+ liveins: $r2d, $r3d
+
+ %1:addr64bit = COPY $r3d
+ %0:addr64bit = COPY $r2d
+ %2:gr64bit = LG $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ %3:gr64bit = LGF $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ %4:gr64bit = LG $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ %5:gr64bit = LGF $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ %6:gr64bit = LG $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ %7:gr64bit = LGF $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ %8:gr64bit = LG $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ %9:gr64bit = LGF $noreg, 0, $noreg :: (load (s64) from %ir.Arg4)
+ STG %2, %1, 0, $noreg :: (store (s64) into %ir.Arg2)
+ STG %3, %0, 0, $noreg :: (store (s64) into %ir.Arg0)
+ STG %4, %1, 0, $noreg :: (store (s64) into %ir.Arg2)
+ STG %5, %0, 0, $noreg :: (store (s64) into %ir.Arg0)
+ STG %6, %1, 0, $noreg :: (store (s64) into %ir.Arg2)
+ STG %7, %0, 0, $noreg :: (store (s64) into %ir.Arg0)
+ STG %8, %1, 0, $noreg :: (store (s64) into %ir.Arg2)
+ STG %9, %0, 0, $noreg :: (store (s64) into %ir.Arg0)
+ Return
+...
diff --git a/llvm/test/CodeGen/SystemZ/misched-prera-tinyregions.mir b/llvm/test/CodeGen/SystemZ/misched-prera-tinyregions.mir
new file mode 100644
index 0000000000000..f2d4f0bfc87a3
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/misched-prera-tinyregions.mir
@@ -0,0 +1,160 @@
+# RUN: llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z16 \
+# RUN: -verify-machineinstrs -run-pass=machine-scheduler 2>&1 \
+# RUN: -debug-only=machine-scheduler | FileCheck %s
+# REQUIRES: asserts
+
+# Don't move things around in this tiny region with just 7 instructions. It has
+# a good input order in the sense that %0 and %1 do not overlap and both have
+# COPYs involving $r2d.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun0:%bb.0
+# CHECK: Region is tiny.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): %0:addr64bit = COPY $r2d
+# CHECK-NEXT: SU(1): STG %0:addr64bit, $noreg, 0, $noreg :: (store (s64) into
+# CHECK-NEXT: SU(2): %1:gr64bit = LA %0:addr64bit, 24, $noreg
+# CHECK-NEXT: SU(3): $r1d = LGHI 0
+# CHECK-NEXT: SU(4): $r2d = COPY %1:gr64bit
+# CHECK-NEXT: SU(5): $r3d = LGHI 0
+# CHECK-NEXT: SU(6): $r4l = LHIMux 0
+---
+name: fun0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r2d
+ %0:addr64bit = COPY $r2d
+ STG %0, $noreg, 0, $noreg :: (store (s64) into `ptr null`)
+ %1:gr64bit = LA %0, 24, $noreg
+ $r1d = LGHI 0
+ $r2d = COPY %1
+ $r3d = LGHI 0
+ $r4l = LHIMux 0
+ CallBR killed $r1d, csr_systemz_elf, implicit $r2d, implicit killed $r3d, implicit killed $r4l
+...
+
+# This function has two tiny regions, which are generally not reordered much.
+# The region in bb.1 however contains a WFADB with a long latency so it should
+# be scheduled normally and moved up.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun1:%bb.0
+# CHECK: Region is tiny.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): %0:fp64bit = COPY $f0d
+# CHECK-NEXT: SU(1): %1:gr32bit = COPY $r2l
+# CHECK-NEXT: SU(2): %2:vr128bit = VGBM 0
+# CHECK-NEXT: SU(3): %3:fp64bit = LZDR
+# CHECK-NEXT: SU(4): %4:vr64bit = COPY %3:fp64bit
+# CHECK-NEXT: SU(5): %5:vr128bit = VGBM 0
+
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun1:%bb.1
+# CHECK: Region is not tiny.
+# CHECK: *** Final schedule for %bb.1 ***
+# CHECK-NEXT: SU(1): %7:vr128bit = COPY %5:vr128bit
+# CHECK-NEXT: SU(0): dead %6:vr64bit = COPY %4:vr64bit
+# CHECK-NEXT: SU(2): %7:vr128bit = VLEIG %7:vr128bit(tied-def 0), 0, 0
+# CHECK-NEXT: SU(5): %4:vr64bit = nofpexcept WFADB %0:fp64bit, %3:fp64bit, implicit $fpc
+# CHECK-NEXT: SU(3): %8:vr128bit = nofpexcept VFCHDB %7:vr128bit, %2:vr128bit, implicit $fpc
+# CHECK-NEXT: SU(4): %5:vr128bit = VSEL %2:vr128bit, %5:vr128bit, %8:vr128bit
+# CHECK-NEXT: SU(6): TMLMux %1:gr32bit, 1, implicit-def $cc
+---
+name: fun1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r2l, $f0d
+
+ %5:fp64bit = COPY $f0d
+ %4:gr32bit = COPY $r2l
+ %8:vr128bit = VGBM 0
+ %7:fp64bit = LZDR
+ %19:vr64bit = COPY %7
+ %20:vr128bit = VGBM 0
+
+ bb.1:
+ successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+
+ %0:vr64bit = COPY %19
+ %10:vr128bit = COPY %20
+ %10:vr128bit = VLEIG %10, 0, 0
+ %12:vr128bit = nofpexcept VFCHDB %10, %8, implicit $fpc
+ %20:vr128bit = VSEL %8, %20, %12
+ %19:vr64bit = nofpexcept WFADB %5, %7, implicit $fpc
+ TMLMux %4, 1, implicit-def $cc
+ BRC 15, 7, %bb.1, implicit killed $cc
+ J %bb.2
+
+ bb.2:
+ Return
+...
+
+# The tiny region in bb.0 has an AGF with latency 6, which would normally make
+# this region be handled as a big region with latency scheduling, but it also
+# has a compare-0. No reordering is done and the NG stays close to the CGHI.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun2:%bb.0
+# CHECK: Region is tiny.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): %0:gr64bit = COPY $r3d
+# CHECK-NEXT: SU(1): %1:gr64bit = COPY $r2d
+# CHECK-NEXT: SU(2): %1:gr64bit = AGF %1:gr64bit(tied-def 0), {{.*}}
+# CHECK-NEXT: SU(3): MVGHI $noreg, 0, 0 :: (store (s64) into `ptr null`)
+# CHECK-NEXT: SU(4): %2:gr64bit = LGFR %1.subreg_l32:gr64bit
+# CHECK-NEXT: SU(5): dead %3:gr64bit = NGRK %2:gr64bit, %0:gr64bit, {{.*}}
+# CHECK-NEXT: SU(6): %4:gr64bit = COPY undef %5:gr64bit
+# CHECK-NEXT: SU(7): %4:gr64bit = NG %4:gr64bit(tied-def 0), {{.*}}
+# CHECK-NEXT: SU(8): CGHI %4:gr64bit, 0, implicit-def $cc
+---
+name: fun2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ successors: %bb.2(0x30000000), %bb.1(0x50000000)
+ liveins: $r2d, $r3d
+
+ %1:gr64bit = COPY $r3d
+ %2:gr64bit = COPY $r2d
+ %2:gr64bit = AGF %2, $noreg, 0, $noreg, implicit-def dead $cc
+ MVGHI $noreg, 0, 0 :: (store (s64) into `ptr null`)
+ %4:gr64bit = LGFR %2.subreg_l32
+ %5:gr64bit = NGRK %4, %1, implicit-def dead $cc
+ %6:gr64bit = COPY undef %7:gr64bit
+ %6:gr64bit = NG %6, $noreg, 0, $noreg, implicit-def dead $cc
+ CGHI %6, 0, implicit-def $cc
+ BRC 14, 8, %bb.2, implicit killed $cc
+ J %bb.1
+
+ bb.1:
+
+ bb.2:
+ $r2d = LGHI 0
+ Return implicit $r2d
+...
+
+# A blocking long-latency instruction (FP divide) is moved up even in a tiny region.
+# CHECK: ********** MI Scheduling **********
+# CHECK-NEXT: fun3:%bb.0
+# CHECK: Region is tiny.
+# CHECK: *** Final schedule for %bb.0 ***
+# CHECK-NEXT: SU(0): dead %0:gr64bit = COPY $r3d
+# CHECK-NEXT: SU(1): %1:gr64bit = COPY $r2d
+# CHECK-NEXT: SU(4): %2:fp64bit = COPY undef %3:fp64bit
+# CHECK-NEXT: SU(5): dead %2:fp64bit = DDB %2:fp64bit(tied-def 0), {{.*}}
+# CHECK-NEXT: SU(2): %1:gr64bit = AGRK %1:gr64bit, %1:gr64bit, {{.*}}
+# CHECK-NEXT: SU(3): dead %1:gr64bit = AGRK %1:gr64bit, %1:gr64bit, {{.*}}
+---
+name: fun3
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r2d, $r3d
+
+ %0:gr64bit = COPY $r3d
+ %1:gr64bit = COPY $r2d
+ %1:gr64bit = AGRK %1, %1, implicit-def dead $cc
+ %1:gr64bit = AGRK %1, %1, implicit-def dead $cc
+ %2:fp64bit = COPY undef %3:fp64bit
+ %2:fp64bit = DDB %2, $noreg, 0, $noreg, implicit $fpc
+ Return
+...
diff --git a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
index 678d9a9073155..b5bbf8c38546a 100644
--- a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
+++ b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
@@ -14,12 +14,12 @@
define void @main(i16 %in) {
; CHECK-LABEL: main:
; CHECK: # %bb.0:
-; CHECK-NEXT: lhr %r2, %r2
+; CHECK-NEXT: lhr %r0, %r2
; CHECK-NEXT: larl %r1, g_151
; CHECK-NEXT: lghi %r3, 0
-; CHECK-NEXT: chi %r2, 0
-; CHECK-NEXT: lhi %r0, 1
+; CHECK-NEXT: chi %r0, 0
; CHECK-NEXT: locghile %r3, 1
+; CHECK-NEXT: lhi %r0, 1
; CHECK-NEXT: o %r0, 0(%r1)
; CHECK-NEXT: larl %r1, g_222
; CHECK-NEXT: lghi %r5, 0
diff --git a/llvm/test/CodeGen/SystemZ/rot-03.ll b/llvm/test/CodeGen/SystemZ/rot-03.ll
index 8f42439dabdf8..403ea976ebcea 100644
--- a/llvm/test/CodeGen/SystemZ/rot-03.ll
+++ b/llvm/test/CodeGen/SystemZ/rot-03.ll
@@ -8,10 +8,10 @@ define i128 @f1(i128 %val) {
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vrepib %v1, 100
-; CHECK-NEXT: vsrlb %v2, %v0, %v1
-; CHECK-NEXT: vsrl %v1, %v2, %v1
; CHECK-NEXT: vrepib %v2, 28
+; CHECK-NEXT: vsrlb %v3, %v0, %v1
; CHECK-NEXT: vslb %v0, %v0, %v2
+; CHECK-NEXT: vsrl %v1, %v3, %v1
; CHECK-NEXT: vsl %v0, %v0, %v2
; CHECK-NEXT: vo %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
@@ -47,19 +47,19 @@ define i128 @f3(i128 %val, i128 %amt) {
; CHECK-LABEL: f3:
; CHECK: # %bb.0:
; CHECK-NEXT: l %r0, 12(%r4)
-; CHECK-NEXT: vlvgp %v1, %r0, %r0
; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vrepb %v1, %v1, 15
-; CHECK-NEXT: vslb %v2, %v0, %v1
-; CHECK-NEXT: vsl %v1, %v2, %v1
-; CHECK-NEXT: vrepib %v2, 1
+; CHECK-NEXT: vlvgp %v1, %r0, %r0
; CHECK-NEXT: xilf %r0, 4294967295
-; CHECK-NEXT: vsrl %v0, %v0, %v2
; CHECK-NEXT: vlvgp %v2, %r0, %r0
+; CHECK-NEXT: vrepib %v3, 1
+; CHECK-NEXT: vrepb %v1, %v1, 15
+; CHECK-NEXT: vsrl %v3, %v0, %v3
; CHECK-NEXT: vrepb %v2, %v2, 15
-; CHECK-NEXT: vsrlb %v0, %v0, %v2
-; CHECK-NEXT: vsrl %v0, %v0, %v2
-; CHECK-NEXT: vo %v0, %v1, %v0
+; CHECK-NEXT: vslb %v0, %v0, %v1
+; CHECK-NEXT: vsrlb %v3, %v3, %v2
+; CHECK-NEXT: vsl %v0, %v0, %v1
+; CHECK-NEXT: vsrl %v1, %v3, %v2
+; CHECK-NEXT: vo %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
diff --git a/llvm/test/CodeGen/SystemZ/shift-13.ll b/llvm/test/CodeGen/SystemZ/shift-13.ll
index e214a18861172..e1a9f18082202 100644
--- a/llvm/test/CodeGen/SystemZ/shift-13.ll
+++ b/llvm/test/CodeGen/SystemZ/shift-13.ll
@@ -99,11 +99,11 @@ define i128 @f7(i128 %a, i128 %sh) {
; CHECK: # %bb.0:
; CHECK-NEXT: lhi %r0, 63
; CHECK-NEXT: n %r0, 12(%r4)
-; CHECK-NEXT: vlvgp %v1, %r0, %r0
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vrepb %v1, %v1, 15
-; CHECK-NEXT: vslb %v0, %v0, %v1
-; CHECK-NEXT: vsl %v0, %v0, %v1
+; CHECK-NEXT: vlvgp %v0, %r0, %r0
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vrepb %v0, %v0, 15
+; CHECK-NEXT: vslb %v1, %v1, %v0
+; CHECK-NEXT: vsl %v0, %v1, %v0
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%and = and i128 %sh, 63
@@ -115,15 +115,15 @@ define i128 @f7(i128 %a, i128 %sh) {
define i128 @f8(i128 %a, i128 %b, i128 %sh) {
; CHECK-LABEL: f8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v1, 0(%r4), 3
-; CHECK-NEXT: vl %v2, 0(%r5), 3
-; CHECK-NEXT: vn %v1, %v2, %v1
-; CHECK-NEXT: vlgvf %r0, %v1, 3
-; CHECK-NEXT: vlvgp %v1, %r0, %r0
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vrepb %v1, %v1, 15
-; CHECK-NEXT: vslb %v0, %v0, %v1
-; CHECK-NEXT: vsl %v0, %v0, %v1
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vl %v1, 0(%r5), 3
+; CHECK-NEXT: vn %v0, %v1, %v0
+; CHECK-NEXT: vlgvf %r0, %v0, 3
+; CHECK-NEXT: vlvgp %v0, %r0, %r0
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vrepb %v0, %v0, 15
+; CHECK-NEXT: vslb %v1, %v1, %v0
+; CHECK-NEXT: vsl %v0, %v1, %v0
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%and = and i128 %sh, %b
@@ -136,16 +136,16 @@ define i128 @f9(i128 %a, i128 %sh) {
; CHECK-LABEL: f9:
; CHECK: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI8_0
-; CHECK-NEXT: vl %v1, 0(%r4), 3
-; CHECK-NEXT: vl %v2, 0(%r1), 3
-; CHECK-NEXT: vn %v1, %v1, %v2
-; CHECK-NEXT: vlgvf %r0, %v1, 3
-; CHECK-NEXT: vlvgp %v2, %r0, %r0
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vrepb %v2, %v2, 15
-; CHECK-NEXT: vslb %v0, %v0, %v2
-; CHECK-NEXT: vsl %v0, %v0, %v2
-; CHECK-NEXT: vaq %v0, %v1, %v0
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vl %v1, 0(%r1), 3
+; CHECK-NEXT: vn %v0, %v0, %v1
+; CHECK-NEXT: vlgvf %r0, %v0, 3
+; CHECK-NEXT: vlvgp %v1, %r0, %r0
+; CHECK-NEXT: vl %v2, 0(%r3), 3
+; CHECK-NEXT: vrepb %v1, %v1, 15
+; CHECK-NEXT: vslb %v2, %v2, %v1
+; CHECK-NEXT: vsl %v1, %v2, %v1
+; CHECK-NEXT: vaq %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%and = and i128 %sh, 127
diff --git a/llvm/test/CodeGen/SystemZ/shift-14.ll b/llvm/test/CodeGen/SystemZ/shift-14.ll
index e45126043f273..b863cc2c30fe3 100644
--- a/llvm/test/CodeGen/SystemZ/shift-14.ll
+++ b/llvm/test/CodeGen/SystemZ/shift-14.ll
@@ -99,11 +99,11 @@ define i128 @f7(i128 %a, i128 %sh) {
; CHECK: # %bb.0:
; CHECK-NEXT: lhi %r0, 63
; CHECK-NEXT: n %r0, 12(%r4)
-; CHECK-NEXT: vlvgp %v1, %r0, %r0
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vrepb %v1, %v1, 15
-; CHECK-NEXT: vsrlb %v0, %v0, %v1
-; CHECK-NEXT: vsrl %v0, %v0, %v1
+; CHECK-NEXT: vlvgp %v0, %r0, %r0
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vrepb %v0, %v0, 15
+; CHECK-NEXT: vsrlb %v1, %v1, %v0
+; CHECK-NEXT: vsrl %v0, %v1, %v0
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%and = and i128 %sh, 63
@@ -115,15 +115,15 @@ define i128 @f7(i128 %a, i128 %sh) {
define i128 @f8(i128 %a, i128 %b, i128 %sh) {
; CHECK-LABEL: f8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v1, 0(%r4), 3
-; CHECK-NEXT: vl %v2, 0(%r5), 3
-; CHECK-NEXT: vn %v1, %v2, %v1
-; CHECK-NEXT: vlgvf %r0, %v1, 3
-; CHECK-NEXT: vlvgp %v1, %r0, %r0
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vrepb %v1, %v1, 15
-; CHECK-NEXT: vsrlb %v0, %v0, %v1
-; CHECK-NEXT: vsrl %v0, %v0, %v1
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vl %v1, 0(%r5), 3
+; CHECK-NEXT: vn %v0, %v1, %v0
+; CHECK-NEXT: vlgvf %r0, %v0, 3
+; CHECK-NEXT: vlvgp %v0, %r0, %r0
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vrepb %v0, %v0, 15
+; CHECK-NEXT: vsrlb %v1, %v1, %v0
+; CHECK-NEXT: vsrl %v0, %v1, %v0
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%and = and i128 %sh, %b
@@ -136,16 +136,16 @@ define i128 @f9(i128 %a, i128 %sh) {
; CHECK-LABEL: f9:
; CHECK: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI8_0
-; CHECK-NEXT: vl %v1, 0(%r4), 3
-; CHECK-NEXT: vl %v2, 0(%r1), 3
-; CHECK-NEXT: vn %v1, %v1, %v2
-; CHECK-NEXT: vlgvf %r0, %v1, 3
-; CHECK-NEXT: vlvgp %v2, %r0, %r0
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vrepb %v2, %v2, 15
-; CHECK-NEXT: vsrlb %v0, %v0, %v2
-; CHECK-NEXT: vsrl %v0, %v0, %v2
-; CHECK-NEXT: vaq %v0, %v1, %v0
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vl %v1, 0(%r1), 3
+; CHECK-NEXT: vn %v0, %v0, %v1
+; CHECK-NEXT: vlgvf %r0, %v0, 3
+; CHECK-NEXT: vlvgp %v1, %r0, %r0
+; CHECK-NEXT: vl %v2, 0(%r3), 3
+; CHECK-NEXT: vrepb %v1, %v1, 15
+; CHECK-NEXT: vsrlb %v2, %v2, %v1
+; CHECK-NEXT: vsrl %v1, %v2, %v1
+; CHECK-NEXT: vaq %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%and = and i128 %sh, 127
diff --git a/llvm/test/CodeGen/SystemZ/shift-15.ll b/llvm/test/CodeGen/SystemZ/shift-15.ll
index e21d05c4c91c8..5514a90520138 100644
--- a/llvm/test/CodeGen/SystemZ/shift-15.ll
+++ b/llvm/test/CodeGen/SystemZ/shift-15.ll
@@ -99,11 +99,11 @@ define i128 @f7(i128 %a, i128 %sh) {
; CHECK: # %bb.0:
; CHECK-NEXT: lhi %r0, 63
; CHECK-NEXT: n %r0, 12(%r4)
-; CHECK-NEXT: vlvgp %v1, %r0, %r0
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vrepb %v1, %v1, 15
-; CHECK-NEXT: vsrab %v0, %v0, %v1
-; CHECK-NEXT: vsra %v0, %v0, %v1
+; CHECK-NEXT: vlvgp %v0, %r0, %r0
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vrepb %v0, %v0, 15
+; CHECK-NEXT: vsrab %v1, %v1, %v0
+; CHECK-NEXT: vsra %v0, %v1, %v0
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%and = and i128 %sh, 63
@@ -115,15 +115,15 @@ define i128 @f7(i128 %a, i128 %sh) {
define i128 @f8(i128 %a, i128 %b, i128 %sh) {
; CHECK-LABEL: f8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v1, 0(%r4), 3
-; CHECK-NEXT: vl %v2, 0(%r5), 3
-; CHECK-NEXT: vn %v1, %v2, %v1
-; CHECK-NEXT: vlgvf %r0, %v1, 3
-; CHECK-NEXT: vlvgp %v1, %r0, %r0
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vrepb %v1, %v1, 15
-; CHECK-NEXT: vsrab %v0, %v0, %v1
-; CHECK-NEXT: vsra %v0, %v0, %v1
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vl %v1, 0(%r5), 3
+; CHECK-NEXT: vn %v0, %v1, %v0
+; CHECK-NEXT: vlgvf %r0, %v0, 3
+; CHECK-NEXT: vlvgp %v0, %r0, %r0
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: vrepb %v0, %v0, 15
+; CHECK-NEXT: vsrab %v1, %v1, %v0
+; CHECK-NEXT: vsra %v0, %v1, %v0
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%and = and i128 %sh, %b
@@ -136,16 +136,16 @@ define i128 @f9(i128 %a, i128 %sh) {
; CHECK-LABEL: f9:
; CHECK: # %bb.0:
; CHECK-NEXT: larl %r1, .LCPI8_0
-; CHECK-NEXT: vl %v1, 0(%r4), 3
-; CHECK-NEXT: vl %v2, 0(%r1), 3
-; CHECK-NEXT: vn %v1, %v1, %v2
-; CHECK-NEXT: vlgvf %r0, %v1, 3
-; CHECK-NEXT: vlvgp %v2, %r0, %r0
-; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vrepb %v2, %v2, 15
-; CHECK-NEXT: vsrab %v0, %v0, %v2
-; CHECK-NEXT: vsra %v0, %v0, %v2
-; CHECK-NEXT: vaq %v0, %v1, %v0
+; CHECK-NEXT: vl %v0, 0(%r4), 3
+; CHECK-NEXT: vl %v1, 0(%r1), 3
+; CHECK-NEXT: vn %v0, %v0, %v1
+; CHECK-NEXT: vlgvf %r0, %v0, 3
+; CHECK-NEXT: vlvgp %v1, %r0, %r0
+; CHECK-NEXT: vl %v2, 0(%r3), 3
+; CHECK-NEXT: vrepb %v1, %v1, 15
+; CHECK-NEXT: vsrab %v2, %v2, %v1
+; CHECK-NEXT: vsra %v1, %v2, %v1
+; CHECK-NEXT: vaq %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
%and = and i128 %sh, 127
diff --git a/llvm/test/CodeGen/SystemZ/shift-16.ll b/llvm/test/CodeGen/SystemZ/shift-16.ll
index d81c3546998be..fc291d65968f7 100644
--- a/llvm/test/CodeGen/SystemZ/shift-16.ll
+++ b/llvm/test/CodeGen/SystemZ/shift-16.ll
@@ -15,18 +15,18 @@ define i256 @f1(i256 %a, i256 %sh) {
; CHECK-NEXT: lr %r1, %r0
; CHECK-NEXT: xilf %r1, 4294967295
; CHECK-NEXT: vlvgp %v2, %r0, %r0
-; CHECK-NEXT: vlvgp %v5, %r1, %r1
+; CHECK-NEXT: vlvgp %v3, %r1, %r1
; CHECK-NEXT: vrepib %v4, 1
-; CHECK-NEXT: vrepb %v3, %v2, 15
-; CHECK-NEXT: vsrl %v4, %v1, %v4
-; CHECK-NEXT: vrepb %v5, %v5, 15
-; CHECK-NEXT: vslb %v2, %v0, %v3
-; CHECK-NEXT: vsrlb %v4, %v4, %v5
-; CHECK-NEXT: vslb %v1, %v1, %v3
-; CHECK-NEXT: vsl %v2, %v2, %v3
-; CHECK-NEXT: vsrl %v4, %v4, %v5
-; CHECK-NEXT: vo %v2, %v2, %v4
-; CHECK-NEXT: vsl %v1, %v1, %v3
+; CHECK-NEXT: vrepb %v5, %v2, 15
+; CHECK-NEXT: vsrl %v2, %v1, %v4
+; CHECK-NEXT: vrepb %v3, %v3, 15
+; CHECK-NEXT: vslb %v4, %v0, %v5
+; CHECK-NEXT: vsrlb %v2, %v2, %v3
+; CHECK-NEXT: vslb %v1, %v1, %v5
+; CHECK-NEXT: vsl %v4, %v4, %v5
+; CHECK-NEXT: vsrl %v2, %v2, %v3
+; CHECK-NEXT: vo %v2, %v4, %v2
+; CHECK-NEXT: vsl %v1, %v1, %v5
; CHECK-NEXT: cijlh %r0, 0, .LBB0_3
; CHECK-NEXT: j .LBB0_4
; CHECK-NEXT: .LBB0_2:
@@ -59,18 +59,18 @@ define i256 @f2(i256 %a, i256 %sh) {
; CHECK-NEXT: lr %r1, %r0
; CHECK-NEXT: xilf %r1, 4294967295
; CHECK-NEXT: vlvgp %v2, %r0, %r0
-; CHECK-NEXT: vlvgp %v5, %r1, %r1
+; CHECK-NEXT: vlvgp %v3, %r1, %r1
; CHECK-NEXT: vrepib %v4, 1
-; CHECK-NEXT: vrepb %v3, %v2, 15
-; CHECK-NEXT: vsl %v4, %v1, %v4
-; CHECK-NEXT: vrepb %v5, %v5, 15
-; CHECK-NEXT: vsrlb %v2, %v0, %v3
-; CHECK-NEXT: vslb %v4, %v4, %v5
-; CHECK-NEXT: vsrlb %v1, %v1, %v3
-; CHECK-NEXT: vsrl %v2, %v2, %v3
-; CHECK-NEXT: vsl %v4, %v4, %v5
-; CHECK-NEXT: vo %v2, %v4, %v2
-; CHECK-NEXT: vsrl %v1, %v1, %v3
+; CHECK-NEXT: vrepb %v5, %v2, 15
+; CHECK-NEXT: vsl %v2, %v1, %v4
+; CHECK-NEXT: vrepb %v3, %v3, 15
+; CHECK-NEXT: vsrlb %v4, %v0, %v5
+; CHECK-NEXT: vslb %v2, %v2, %v3
+; CHECK-NEXT: vsrlb %v1, %v1, %v5
+; CHECK-NEXT: vsrl %v4, %v4, %v5
+; CHECK-NEXT: vsl %v2, %v2, %v3
+; CHECK-NEXT: vo %v2, %v2, %v4
+; CHECK-NEXT: vsrl %v1, %v1, %v5
; CHECK-NEXT: cijlh %r0, 0, .LBB1_3
; CHECK-NEXT: j .LBB1_4
; CHECK-NEXT: .LBB1_2:
@@ -97,40 +97,40 @@ define i256 @f3(i256 %a, i256 %sh) {
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 16(%r3), 3
; CHECK-NEXT: l %r0, 28(%r4)
-; CHECK-NEXT: vl %v2, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r3), 3
; CHECK-NEXT: clijhe %r0, 128, .LBB2_2
; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: vlvgp %v1, %r0, %r0
-; CHECK-NEXT: vrepb %v3, %v1, 15
-; CHECK-NEXT: vsrab %v1, %v2, %v3
-; CHECK-NEXT: vsrlb %v4, %v0, %v3
-; CHECK-NEXT: vsra %v1, %v1, %v3
; CHECK-NEXT: lr %r1, %r0
-; CHECK-NEXT: vsrl %v3, %v4, %v3
-; CHECK-NEXT: vrepib %v4, 1
; CHECK-NEXT: xilf %r1, 4294967295
-; CHECK-NEXT: vsl %v2, %v2, %v4
-; CHECK-NEXT: vlvgp %v4, %r1, %r1
-; CHECK-NEXT: vrepb %v4, %v4, 15
-; CHECK-NEXT: vslb %v2, %v2, %v4
-; CHECK-NEXT: vsl %v2, %v2, %v4
-; CHECK-NEXT: vo %v2, %v2, %v3
+; CHECK-NEXT: vlvgp %v2, %r0, %r0
+; CHECK-NEXT: vlvgp %v3, %r1, %r1
+; CHECK-NEXT: vrepib %v4, 1
+; CHECK-NEXT: vrepb %v2, %v2, 15
+; CHECK-NEXT: vsl %v4, %v1, %v4
+; CHECK-NEXT: vrepb %v3, %v3, 15
+; CHECK-NEXT: vsrlb %v5, %v0, %v2
+; CHECK-NEXT: vslb %v4, %v4, %v3
+; CHECK-NEXT: vsrab %v1, %v1, %v2
+; CHECK-NEXT: vsrl %v5, %v5, %v2
+; CHECK-NEXT: vsl %v3, %v4, %v3
+; CHECK-NEXT: vsra %v2, %v1, %v2
+; CHECK-NEXT: vo %v1, %v3, %v5
; CHECK-NEXT: cijlh %r0, 0, .LBB2_3
; CHECK-NEXT: j .LBB2_4
; CHECK-NEXT: .LBB2_2:
-; CHECK-NEXT: vrepib %v1, 127
-; CHECK-NEXT: vsrab %v3, %v2, %v1
+; CHECK-NEXT: vrepib %v2, 127
+; CHECK-NEXT: vsrab %v3, %v1, %v2
; CHECK-NEXT: ahik %r1, %r0, -128
-; CHECK-NEXT: vsra %v1, %v3, %v1
+; CHECK-NEXT: vsra %v2, %v3, %v2
; CHECK-NEXT: vlvgp %v3, %r1, %r1
; CHECK-NEXT: vrepb %v3, %v3, 15
-; CHECK-NEXT: vsrab %v2, %v2, %v3
-; CHECK-NEXT: vsra %v2, %v2, %v3
+; CHECK-NEXT: vsrab %v1, %v1, %v3
+; CHECK-NEXT: vsra %v1, %v1, %v3
; CHECK-NEXT: cije %r0, 0, .LBB2_4
; CHECK-NEXT: .LBB2_3:
-; CHECK-NEXT: vlr %v0, %v2
+; CHECK-NEXT: vlr %v0, %v1
; CHECK-NEXT: .LBB2_4:
-; CHECK-NEXT: vst %v1, 0(%r2), 3
+; CHECK-NEXT: vst %v2, 0(%r2), 3
; CHECK-NEXT: vst %v0, 16(%r2), 3
; CHECK-NEXT: br %r14
%res = ashr i256 %a, %sh
diff --git a/llvm/test/CodeGen/SystemZ/shift-17.ll b/llvm/test/CodeGen/SystemZ/shift-17.ll
index 45f4ed4d70d20..64e655de77337 100644
--- a/llvm/test/CodeGen/SystemZ/shift-17.ll
+++ b/llvm/test/CodeGen/SystemZ/shift-17.ll
@@ -32,14 +32,14 @@ define i128 @f1(i128 %a, i128 %b) {
define i128 @f2(i128 %a, i128 %b) {
; CHECK-LABEL: f2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v1, 0(%r3), 3
; CHECK-NEXT: vl %v0, 0(%r4), 3
-; CHECK-NEXT: vrepib %v2, 5
-; CHECK-NEXT: vsl %v1, %v1, %v2
-; CHECK-NEXT: vrepib %v2, 123
-; CHECK-NEXT: vsrlb %v0, %v0, %v2
-; CHECK-NEXT: vsrl %v0, %v0, %v2
-; CHECK-NEXT: vo %v0, %v1, %v0
+; CHECK-NEXT: vrepib %v1, 123
+; CHECK-NEXT: vl %v2, 0(%r3), 3
+; CHECK-NEXT: vsrlb %v0, %v0, %v1
+; CHECK-NEXT: vrepib %v3, 5
+; CHECK-NEXT: vsl %v2, %v2, %v3
+; CHECK-NEXT: vsrl %v0, %v0, %v1
+; CHECK-NEXT: vo %v0, %v2, %v0
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
;
@@ -58,14 +58,14 @@ define i128 @f2(i128 %a, i128 %b) {
define i128 @f3(i128 %a, i128 %b) {
; CHECK-LABEL: f3:
; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vl %v1, 0(%r4), 3
; CHECK-NEXT: vrepib %v2, 86
+; CHECK-NEXT: vrepib %v3, 42
; CHECK-NEXT: vsrlb %v1, %v1, %v2
-; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vslb %v0, %v0, %v3
; CHECK-NEXT: vsrl %v1, %v1, %v2
-; CHECK-NEXT: vrepib %v2, 42
-; CHECK-NEXT: vslb %v0, %v0, %v2
-; CHECK-NEXT: vsl %v0, %v0, %v2
+; CHECK-NEXT: vsl %v0, %v0, %v3
; CHECK-NEXT: vo %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
@@ -88,18 +88,18 @@ define i128 @f4(i128 %a, i128 %b, i128 %sh) {
; CHECK-LABEL: f4:
; CHECK: # %bb.0:
; CHECK-NEXT: l %r0, 12(%r5)
-; CHECK-NEXT: vlvgp %v2, %r0, %r0
-; CHECK-NEXT: vl %v1, 0(%r3), 3
-; CHECK-NEXT: vrepb %v2, %v2, 15
-; CHECK-NEXT: vslb %v1, %v1, %v2
; CHECK-NEXT: vl %v0, 0(%r4), 3
-; CHECK-NEXT: vsl %v1, %v1, %v2
-; CHECK-NEXT: vrepib %v2, 1
+; CHECK-NEXT: vlvgp %v1, %r0, %r0
; CHECK-NEXT: xilf %r0, 4294967295
-; CHECK-NEXT: vsrl %v0, %v0, %v2
; CHECK-NEXT: vlvgp %v2, %r0, %r0
+; CHECK-NEXT: vl %v3, 0(%r3), 3
+; CHECK-NEXT: vrepib %v4, 1
+; CHECK-NEXT: vrepb %v1, %v1, 15
+; CHECK-NEXT: vsrl %v0, %v0, %v4
; CHECK-NEXT: vrepb %v2, %v2, 15
+; CHECK-NEXT: vslb %v3, %v3, %v1
; CHECK-NEXT: vsrlb %v0, %v0, %v2
+; CHECK-NEXT: vsl %v1, %v3, %v1
; CHECK-NEXT: vsrl %v0, %v0, %v2
; CHECK-NEXT: vo %v0, %v1, %v0
; CHECK-NEXT: vst %v0, 0(%r2), 3
@@ -108,18 +108,18 @@ define i128 @f4(i128 %a, i128 %b, i128 %sh) {
; Z15-LABEL: f4:
; Z15: # %bb.0:
; Z15-NEXT: l %r0, 12(%r5)
-; Z15-NEXT: vlvgp %v2, %r0, %r0
-; Z15-NEXT: vl %v1, 0(%r3), 3
-; Z15-NEXT: vrepb %v2, %v2, 15
; Z15-NEXT: vl %v0, 0(%r4), 3
-; Z15-NEXT: vslb %v1, %v1, %v2
-; Z15-NEXT: vsl %v1, %v1, %v2
-; Z15-NEXT: vrepib %v2, 1
+; Z15-NEXT: vlvgp %v1, %r0, %r0
; Z15-NEXT: xilf %r0, 4294967295
-; Z15-NEXT: vsrl %v0, %v0, %v2
; Z15-NEXT: vlvgp %v2, %r0, %r0
+; Z15-NEXT: vl %v3, 0(%r3), 3
+; Z15-NEXT: vrepib %v4, 1
+; Z15-NEXT: vrepb %v1, %v1, 15
+; Z15-NEXT: vsrl %v0, %v0, %v4
; Z15-NEXT: vrepb %v2, %v2, 15
+; Z15-NEXT: vslb %v3, %v3, %v1
; Z15-NEXT: vsrlb %v0, %v0, %v2
+; Z15-NEXT: vsl %v1, %v3, %v1
; Z15-NEXT: vsrl %v0, %v0, %v2
; Z15-NEXT: vo %v0, %v1, %v0
; Z15-NEXT: vst %v0, 0(%r2), 3
@@ -153,14 +153,14 @@ define i128 @f5(i128 %a, i128 %b) {
define i128 @f6(i128 %a, i128 %b) {
; CHECK-LABEL: f6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vl %v1, 0(%r4), 3
; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vrepib %v2, 5
-; CHECK-NEXT: vsrl %v1, %v1, %v2
-; CHECK-NEXT: vrepib %v2, 123
-; CHECK-NEXT: vslb %v0, %v0, %v2
-; CHECK-NEXT: vsl %v0, %v0, %v2
-; CHECK-NEXT: vo %v0, %v0, %v1
+; CHECK-NEXT: vrepib %v1, 123
+; CHECK-NEXT: vl %v2, 0(%r4), 3
+; CHECK-NEXT: vslb %v0, %v0, %v1
+; CHECK-NEXT: vrepib %v3, 5
+; CHECK-NEXT: vsrl %v2, %v2, %v3
+; CHECK-NEXT: vsl %v0, %v0, %v1
+; CHECK-NEXT: vo %v0, %v0, %v2
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
;
@@ -179,14 +179,14 @@ define i128 @f6(i128 %a, i128 %b) {
define i128 @f7(i128 %a, i128 %b) {
; CHECK-LABEL: f7:
; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vl %v1, 0(%r4), 3
; CHECK-NEXT: vrepib %v2, 42
+; CHECK-NEXT: vrepib %v3, 86
; CHECK-NEXT: vsrlb %v1, %v1, %v2
-; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vslb %v0, %v0, %v3
; CHECK-NEXT: vsrl %v1, %v1, %v2
-; CHECK-NEXT: vrepib %v2, 86
-; CHECK-NEXT: vslb %v0, %v0, %v2
-; CHECK-NEXT: vsl %v0, %v0, %v2
+; CHECK-NEXT: vsl %v0, %v0, %v3
; CHECK-NEXT: vo %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
@@ -209,18 +209,18 @@ define i128 @f8(i128 %a, i128 %b, i128 %sh) {
; CHECK-LABEL: f8:
; CHECK: # %bb.0:
; CHECK-NEXT: l %r0, 12(%r5)
-; CHECK-NEXT: vlvgp %v2, %r0, %r0
-; CHECK-NEXT: vl %v1, 0(%r4), 3
-; CHECK-NEXT: vrepb %v2, %v2, 15
-; CHECK-NEXT: vsrlb %v1, %v1, %v2
; CHECK-NEXT: vl %v0, 0(%r3), 3
-; CHECK-NEXT: vsrl %v1, %v1, %v2
-; CHECK-NEXT: vrepib %v2, 1
+; CHECK-NEXT: vlvgp %v1, %r0, %r0
; CHECK-NEXT: xilf %r0, 4294967295
-; CHECK-NEXT: vsl %v0, %v0, %v2
; CHECK-NEXT: vlvgp %v2, %r0, %r0
+; CHECK-NEXT: vl %v3, 0(%r4), 3
+; CHECK-NEXT: vrepib %v4, 1
+; CHECK-NEXT: vrepb %v1, %v1, 15
+; CHECK-NEXT: vsl %v0, %v0, %v4
; CHECK-NEXT: vrepb %v2, %v2, 15
+; CHECK-NEXT: vsrlb %v3, %v3, %v1
; CHECK-NEXT: vslb %v0, %v0, %v2
+; CHECK-NEXT: vsrl %v1, %v3, %v1
; CHECK-NEXT: vsl %v0, %v0, %v2
; CHECK-NEXT: vo %v0, %v0, %v1
; CHECK-NEXT: vst %v0, 0(%r2), 3
@@ -229,18 +229,18 @@ define i128 @f8(i128 %a, i128 %b, i128 %sh) {
; Z15-LABEL: f8:
; Z15: # %bb.0:
; Z15-NEXT: l %r0, 12(%r5)
-; Z15-NEXT: vlvgp %v2, %r0, %r0
-; Z15-NEXT: vl %v1, 0(%r4), 3
-; Z15-NEXT: vrepb %v2, %v2, 15
; Z15-NEXT: vl %v0, 0(%r3), 3
-; Z15-NEXT: vsrlb %v1, %v1, %v2
-; Z15-NEXT: vsrl %v1, %v1, %v2
-; Z15-NEXT: vrepib %v2, 1
+; Z15-NEXT: vlvgp %v1, %r0, %r0
; Z15-NEXT: xilf %r0, 4294967295
-; Z15-NEXT: vsl %v0, %v0, %v2
; Z15-NEXT: vlvgp %v2, %r0, %r0
+; Z15-NEXT: vl %v3, 0(%r4), 3
+; Z15-NEXT: vrepib %v4, 1
+; Z15-NEXT: vrepb %v1, %v1, 15
+; Z15-NEXT: vsl %v0, %v0, %v4
; Z15-NEXT: vrepb %v2, %v2, 15
+; Z15-NEXT: vsrlb %v3, %v3, %v1
; Z15-NEXT: vslb %v0, %v0, %v2
+; Z15-NEXT: vsrl %v1, %v3, %v1
; Z15-NEXT: vsl %v0, %v0, %v2
; Z15-NEXT: vo %v0, %v0, %v1
; Z15-NEXT: vst %v0, 0(%r2), 3
diff --git a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
index f7bbad9055afd..d70c099bc6cc7 100644
--- a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
+++ b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
@@ -5,34 +5,34 @@
define void @fun0(<4 x i31> %src, ptr %p)
; CHECK-LABEL: fun0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vlgvf %r0, %v24, 0
-; CHECK-NEXT: vlvgp %v0, %r0, %r0
-; CHECK-NEXT: vrepib %v1, 93
; CHECK-NEXT: vlgvf %r0, %v24, 1
-; CHECK-NEXT: vslb %v0, %v0, %v1
-; CHECK-NEXT: larl %r1, .LCPI0_0
-; CHECK-NEXT: vl %v2, 0(%r1), 3
-; CHECK-NEXT: vsl %v0, %v0, %v1
-; CHECK-NEXT: vlvgp %v1, %r0, %r0
-; CHECK-NEXT: vn %v1, %v1, %v2
-; CHECK-NEXT: vrepib %v3, 62
-; CHECK-NEXT: vslb %v1, %v1, %v3
-; CHECK-NEXT: vlgvf %r0, %v24, 2
-; CHECK-NEXT: vsl %v1, %v1, %v3
-; CHECK-NEXT: vo %v0, %v0, %v1
+; CHECK-NEXT: vlgvf %r1, %v24, 0
+; CHECK-NEXT: larl %r3, .LCPI0_0
+; CHECK-NEXT: vlgvf %r4, %v24, 2
+; CHECK-NEXT: vl %v0, 0(%r3), 3
; CHECK-NEXT: vlvgp %v1, %r0, %r0
-; CHECK-NEXT: vn %v1, %v1, %v2
-; CHECK-NEXT: vrepib %v3, 31
-; CHECK-NEXT: vslb %v1, %v1, %v3
+; CHECK-NEXT: vlvgp %v2, %r1, %r1
+; CHECK-NEXT: vlvgp %v3, %r4, %r4
+; CHECK-NEXT: vn %v1, %v1, %v0
+; CHECK-NEXT: vrepib %v4, 93
+; CHECK-NEXT: vrepib %v5, 62
+; CHECK-NEXT: vslb %v2, %v2, %v4
+; CHECK-NEXT: vslb %v1, %v1, %v5
+; CHECK-NEXT: vn %v3, %v3, %v0
+; CHECK-NEXT: vrepib %v6, 31
; CHECK-NEXT: vlgvf %r0, %v24, 3
-; CHECK-NEXT: vsl %v1, %v1, %v3
-; CHECK-NEXT: vo %v0, %v0, %v1
-; CHECK-NEXT: vlvgp %v1, %r0, %r0
+; CHECK-NEXT: vslb %v3, %v3, %v6
+; CHECK-NEXT: vsl %v2, %v2, %v4
+; CHECK-NEXT: vsl %v1, %v1, %v5
+; CHECK-NEXT: vlvgp %v4, %r0, %r0
; CHECK-NEXT: larl %r1, .LCPI0_1
-; CHECK-NEXT: vn %v1, %v1, %v2
-; CHECK-NEXT: vo %v0, %v0, %v1
-; CHECK-NEXT: vl %v1, 0(%r1), 3
-; CHECK-NEXT: vn %v0, %v0, %v1
+; CHECK-NEXT: vo %v1, %v2, %v1
+; CHECK-NEXT: vsl %v2, %v3, %v6
+; CHECK-NEXT: vo %v1, %v1, %v2
+; CHECK-NEXT: vl %v2, 0(%r1), 3
+; CHECK-NEXT: vn %v0, %v4, %v0
+; CHECK-NEXT: vo %v0, %v1, %v0
+; CHECK-NEXT: vn %v0, %v0, %v2
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
{
@@ -49,34 +49,34 @@ define i16 @fun1(<16 x i1> %src)
; CHECK-NEXT: vlgvb %r0, %v24, 0
; CHECK-NEXT: vlgvb %r1, %v24, 1
; CHECK-NEXT: risblg %r0, %r0, 16, 144, 15
+; CHECK-NEXT: vlgvb %r2, %v24, 2
; CHECK-NEXT: rosbg %r0, %r1, 49, 49, 14
-; CHECK-NEXT: vlgvb %r1, %v24, 2
-; CHECK-NEXT: rosbg %r0, %r1, 50, 50, 13
; CHECK-NEXT: vlgvb %r1, %v24, 3
+; CHECK-NEXT: rosbg %r0, %r2, 50, 50, 13
+; CHECK-NEXT: vlgvb %r2, %v24, 4
; CHECK-NEXT: rosbg %r0, %r1, 51, 51, 12
-; CHECK-NEXT: vlgvb %r1, %v24, 4
-; CHECK-NEXT: rosbg %r0, %r1, 52, 52, 11
; CHECK-NEXT: vlgvb %r1, %v24, 5
+; CHECK-NEXT: rosbg %r0, %r2, 52, 52, 11
+; CHECK-NEXT: vlgvb %r2, %v24, 6
; CHECK-NEXT: rosbg %r0, %r1, 53, 53, 10
-; CHECK-NEXT: vlgvb %r1, %v24, 6
-; CHECK-NEXT: rosbg %r0, %r1, 54, 54, 9
; CHECK-NEXT: vlgvb %r1, %v24, 7
+; CHECK-NEXT: rosbg %r0, %r2, 54, 54, 9
+; CHECK-NEXT: vlgvb %r2, %v24, 8
; CHECK-NEXT: rosbg %r0, %r1, 55, 55, 8
-; CHECK-NEXT: vlgvb %r1, %v24, 8
-; CHECK-NEXT: rosbg %r0, %r1, 56, 56, 7
; CHECK-NEXT: vlgvb %r1, %v24, 9
+; CHECK-NEXT: rosbg %r0, %r2, 56, 56, 7
+; CHECK-NEXT: vlgvb %r2, %v24, 10
; CHECK-NEXT: rosbg %r0, %r1, 57, 57, 6
-; CHECK-NEXT: vlgvb %r1, %v24, 10
-; CHECK-NEXT: rosbg %r0, %r1, 58, 58, 5
; CHECK-NEXT: vlgvb %r1, %v24, 11
+; CHECK-NEXT: rosbg %r0, %r2, 58, 58, 5
+; CHECK-NEXT: vlgvb %r2, %v24, 12
; CHECK-NEXT: rosbg %r0, %r1, 59, 59, 4
-; CHECK-NEXT: vlgvb %r1, %v24, 12
-; CHECK-NEXT: rosbg %r0, %r1, 60, 60, 3
; CHECK-NEXT: vlgvb %r1, %v24, 13
+; CHECK-NEXT: rosbg %r0, %r2, 60, 60, 3
+; CHECK-NEXT: vlgvb %r2, %v24, 14
; CHECK-NEXT: rosbg %r0, %r1, 61, 61, 2
-; CHECK-NEXT: vlgvb %r1, %v24, 14
-; CHECK-NEXT: rosbg %r0, %r1, 62, 62, 1
; CHECK-NEXT: vlgvb %r1, %v24, 15
+; CHECK-NEXT: rosbg %r0, %r2, 62, 62, 1
; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 0
; CHECK-NEXT: llhr %r2, %r0
; CHECK-NEXT: aghi %r15, 168
@@ -179,9 +179,9 @@ define void @fun3(ptr %src, ptr %p)
; CHECK-NEXT: vrepib %v2, 32
; CHECK-NEXT: vslb %v0, %v0, %v2
; CHECK-NEXT: vo %v0, %v1, %v0
+; CHECK-NEXT: vsrlb %v1, %v0, %v2
; CHECK-NEXT: vstef %v0, 8(%r3), 3
-; CHECK-NEXT: vsrlb %v0, %v0, %v2
-; CHECK-NEXT: vsteg %v0, 0(%r3), 1
+; CHECK-NEXT: vsteg %v1, 0(%r3), 1
; CHECK-NEXT: br %r14
{
%tmp = load <3 x i31>, ptr %src
diff --git a/llvm/test/CodeGen/SystemZ/vec-args-04.ll b/llvm/test/CodeGen/SystemZ/vec-args-04.ll
index b1cd278992541..cc9a3d2d0cfff 100644
--- a/llvm/test/CodeGen/SystemZ/vec-args-04.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-args-04.ll
@@ -19,10 +19,9 @@ define void @foo() {
; CHECK-VEC-NEXT: aghi %r15, -192
; CHECK-VEC-NEXT: .cfi_def_cfa_offset 352
; CHECK-VEC-NEXT: larl %r1, .LCPI0_0
+; CHECK-VEC-NEXT: larl %r2, .LCPI0_1
; CHECK-VEC-NEXT: vl %v0, 0(%r1), 3
-; CHECK-VEC-NEXT: larl %r1, .LCPI0_1
-; CHECK-VEC-NEXT: vst %v0, 176(%r15), 3
-; CHECK-VEC-NEXT: vl %v0, 0(%r1), 3
+; CHECK-VEC-NEXT: vl %v1, 0(%r2), 3
; CHECK-VEC-NEXT: vrepib %v24, 1
; CHECK-VEC-NEXT: vrepib %v26, 2
; CHECK-VEC-NEXT: vrepib %v28, 3
@@ -31,7 +30,8 @@ define void @foo() {
; CHECK-VEC-NEXT: vrepib %v27, 6
; CHECK-VEC-NEXT: vrepib %v29, 7
; CHECK-VEC-NEXT: vrepib %v31, 8
-; CHECK-VEC-NEXT: vst %v0, 160(%r15), 3
+; CHECK-VEC-NEXT: vst %v0, 176(%r15), 3
+; CHECK-VEC-NEXT: vst %v1, 160(%r15), 3
; CHECK-VEC-NEXT: brasl %r14, bar at PLT
; CHECK-VEC-NEXT: lmg %r14, %r15, 304(%r15)
; CHECK-VEC-NEXT: br %r14
@@ -44,10 +44,9 @@ define void @foo() {
; CHECK-STACK-NEXT: aghi %r15, -192
; CHECK-STACK-NEXT: .cfi_def_cfa_offset 352
; CHECK-STACK-NEXT: larl %r1, .LCPI0_0
+; CHECK-STACK-NEXT: larl %r2, .LCPI0_1
; CHECK-STACK-NEXT: vl %v0, 0(%r1), 3
-; CHECK-STACK-NEXT: larl %r1, .LCPI0_1
-; CHECK-STACK-NEXT: vst %v0, 176(%r15), 3
-; CHECK-STACK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-STACK-NEXT: vl %v1, 0(%r2), 3
; CHECK-STACK-NEXT: vrepib %v24, 1
; CHECK-STACK-NEXT: vrepib %v26, 2
; CHECK-STACK-NEXT: vrepib %v28, 3
@@ -56,7 +55,8 @@ define void @foo() {
; CHECK-STACK-NEXT: vrepib %v27, 6
; CHECK-STACK-NEXT: vrepib %v29, 7
; CHECK-STACK-NEXT: vrepib %v31, 8
-; CHECK-STACK-NEXT: vst %v0, 160(%r15), 3
+; CHECK-STACK-NEXT: vst %v0, 176(%r15), 3
+; CHECK-STACK-NEXT: vst %v1, 160(%r15), 3
; CHECK-STACK-NEXT: brasl %r14, bar at PLT
; CHECK-STACK-NEXT: lmg %r14, %r15, 304(%r15)
; CHECK-STACK-NEXT: br %r14
diff --git a/llvm/test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll b/llvm/test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll
index c13293bd64a16..25dd2e2df3fa2 100644
--- a/llvm/test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll
@@ -63,7 +63,7 @@ define <16 x i16> @fun3(<16 x i8> %val1, <16 x i8> %val2, <16 x i16> %val3, <16
; CHECK-DAG: vceqh [[REG4:%v[0-9]+]], %v30, %v27
; CHECK-DAG: vl [[REG5:%v[0-9]+]], 176(%r15)
; CHECK-DAG: vl [[REG6:%v[0-9]+]], 160(%r15)
-; CHECK-DAG: vo [[REG7:%v[0-9]+]], %v2, [[REG4]]
+; CHECK-DAG: vo [[REG7:%v[0-9]+]], {{%v[012]+}}, [[REG4]]
; CHECK-DAG: vo [[REG8:%v[0-9]+]], [[REG2]], [[REG3]]
; CHECK-DAG: vsel %v24, %v29, [[REG6]], [[REG8]]
; CHECK-DAG: vsel %v26, %v31, [[REG5]], [[REG7]]
@@ -117,10 +117,10 @@ define <2 x i8> @fun5(<2 x i16> %val1, <2 x i16> %val2, <2 x i8> %val3, <2 x i8>
define <2 x i16> @fun6(<2 x i16> %val1, <2 x i16> %val2, <2 x i8> %val3, <2 x i8> %val4, <2 x i16> %val5, <2 x i16> %val6) {
; CHECK-LABEL: fun6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vceqb %v1, %v28, %v30
-; CHECK-NEXT: vceqh %v0, %v24, %v26
-; CHECK-NEXT: vuphb %v1, %v1
-; CHECK-NEXT: vo %v0, %v0, %v1
+; CHECK-NEXT: vceqb [[REG0:%v[0-9]+]], %v28, %v30
+; CHECK-NEXT: vceqh [[REG1:%v[0-9]+]], %v24, %v26
+; CHECK-NEXT: vuphb [[REG0]], [[REG0]]
+; CHECK-NEXT: vo %v0, [[REG1]], [[REG0]]
; CHECK-NEXT: vsel %v24, %v25, %v27, %v0
; CHECK-NEXT: br %r14
%cmp0 = icmp eq <2 x i16> %val1, %val2
@@ -133,10 +133,10 @@ define <2 x i16> @fun6(<2 x i16> %val1, <2 x i16> %val2, <2 x i8> %val3, <2 x i8
define <2 x i32> @fun7(<2 x i16> %val1, <2 x i16> %val2, <2 x i8> %val3, <2 x i8> %val4, <2 x i32> %val5, <2 x i32> %val6) {
; CHECK-LABEL: fun7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vceqb %v1, %v28, %v30
-; CHECK-NEXT: vceqh %v0, %v24, %v26
-; CHECK-NEXT: vuphb %v1, %v1
-; CHECK-NEXT: vo %v0, %v0, %v1
+; CHECK-NEXT: vceqb [[REG0:%v[0-9]+]], %v28, %v30
+; CHECK-NEXT: vceqh [[REG1:%v[0-9]+]], %v24, %v26
+; CHECK-NEXT: vuphb [[REG0]], [[REG0]]
+; CHECK-NEXT: vo %v0, [[REG1]], [[REG0]]
; CHECK-NEXT: vuphh %v0, %v0
; CHECK-NEXT: vsel %v24, %v25, %v27, %v0
; CHECK-NEXT: br %r14
@@ -259,10 +259,10 @@ define <16 x i16> @fun12(<16 x i16> %val1, <16 x i16> %val2, <16 x i32> %val3, <
define <2 x i16> @fun13(<2 x i32> %val1, <2 x i32> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i16> %val5, <2 x i16> %val6) {
; CHECK-LABEL: fun13:
; CHECK: # %bb.0:
-; CHECK-NEXT: vceqg %v1, %v28, %v30
-; CHECK-NEXT: vceqf %v0, %v24, %v26
-; CHECK-NEXT: vpkg %v1, %v1, %v1
-; CHECK-NEXT: vx %v0, %v0, %v1
+; CHECK-NEXT: vceqg [[REG0:%v[0-9]+]], %v28, %v30
+; CHECK-NEXT: vceqf [[REG1:%v[0-9]+]], %v24, %v26
+; CHECK-NEXT: vpkg [[REG0]], [[REG0]], [[REG0]]
+; CHECK-NEXT: vx %v0, [[REG1]], [[REG0]]
; CHECK-NEXT: vpkf %v0, %v0, %v0
; CHECK-NEXT: vsel %v24, %v25, %v27, %v0
; CHECK-NEXT: br %r14
@@ -276,10 +276,10 @@ define <2 x i16> @fun13(<2 x i32> %val1, <2 x i32> %val2, <2 x i64> %val3, <2 x
define <2 x i32> @fun14(<2 x i32> %val1, <2 x i32> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i32> %val5, <2 x i32> %val6) {
; CHECK-LABEL: fun14:
; CHECK: # %bb.0:
-; CHECK-NEXT: vceqg %v1, %v28, %v30
-; CHECK-NEXT: vceqf %v0, %v24, %v26
-; CHECK-NEXT: vpkg %v1, %v1, %v1
-; CHECK-NEXT: vx %v0, %v0, %v1
+; CHECK-NEXT: vceqg [[REG0:%v[0-9]+]], %v28, %v30
+; CHECK-NEXT: vceqf [[REG1:%v[0-9]+]], %v24, %v26
+; CHECK-NEXT: vpkg [[REG0]], [[REG0]], [[REG0]]
+; CHECK-NEXT: vx %v0, [[REG1]], [[REG0]]
; CHECK-NEXT: vsel %v24, %v25, %v27, %v0
; CHECK-NEXT: br %r14
%cmp0 = icmp eq <2 x i32> %val1, %val2
@@ -324,10 +324,10 @@ define <4 x i16> @fun16(<4 x i32> %val1, <4 x i32> %val2, <4 x i16> %val3, <4 x
define <4 x i32> @fun17(<4 x i32> %val1, <4 x i32> %val2, <4 x i16> %val3, <4 x i16> %val4, <4 x i32> %val5, <4 x i32> %val6) {
; CHECK-LABEL: fun17:
; CHECK: # %bb.0:
-; CHECK-NEXT: vceqh %v1, %v28, %v30
-; CHECK-NEXT: vceqf %v0, %v24, %v26
-; CHECK-NEXT: vuphh %v1, %v1
-; CHECK-NEXT: vn %v0, %v0, %v1
+; CHECK-NEXT: vceqh [[REG0:%v[0-9]+]], %v28, %v30
+; CHECK-NEXT: vceqf [[REG1:%v[0-9]+]], %v24, %v26
+; CHECK-NEXT: vuphh [[REG0]], [[REG0]]
+; CHECK-NEXT: vn %v0, [[REG1]], [[REG0]]
; CHECK-NEXT: vsel %v24, %v25, %v27, %v0
; CHECK-NEXT: br %r14
%cmp0 = icmp eq <4 x i32> %val1, %val2
@@ -340,13 +340,13 @@ define <4 x i32> @fun17(<4 x i32> %val1, <4 x i32> %val2, <4 x i16> %val3, <4 x
define <4 x i64> @fun18(<4 x i32> %val1, <4 x i32> %val2, <4 x i16> %val3, <4 x i16> %val4, <4 x i64> %val5, <4 x i64> %val6) {
; CHECK-LABEL: fun18:
; CHECK: # %bb.0:
-; CHECK-NEXT: vceqh %v1, %v28, %v30
-; CHECK-NEXT: vceqf %v0, %v24, %v26
-; CHECK-NEXT: vuphh %v1, %v1
-; CHECK-NEXT: vn %v0, %v0, %v1
-; CHECK-DAG: vuphf [[REG0:%v[0-9]+]], %v0
+; CHECK-NEXT: vceqh [[REG0:%v[0-9]+]], %v28, %v30
+; CHECK-NEXT: vceqf [[REG1:%v[0-9]+]], %v24, %v26
+; CHECK-NEXT: vuphh [[REG0]], [[REG0]]
+; CHECK-NEXT: vn %v0, [[REG1]], [[REG0]]
+; CHECK-DAG: vuphf [[REG2:%v[0-9]+]], %v0
; CHECK-DAG: vuplf [[REG1:%v[0-9]+]], %v0
-; CHECK-NEXT: vsel %v24, %v25, %v29, [[REG0]]
+; CHECK-NEXT: vsel %v24, %v25, %v29, [[REG2]]
; CHECK-NEXT: vsel %v26, %v27, %v31, [[REG1]]
; CHECK-NEXT: br %r14
%cmp0 = icmp eq <4 x i32> %val1, %val2
@@ -474,28 +474,28 @@ define <2 x float> @fun25(<2 x float> %val1, <2 x float> %val2, <2 x double> %va
; CHECK-LABEL: fun25:
; CHECK: # %bb.0:
; CHECK-NEXT: vmrlf %v0, %v26, %v26
-; CHECK-NEXT: vmrlf %v1, %v24, %v24
-; CHECK-NEXT: vldeb %v0, %v0
-; CHECK-NEXT: vldeb %v1, %v1
-; CHECK-NEXT: vfchdb %v0, %v1, %v0
; CHECK-NEXT: vmrhf %v1, %v26, %v26
-; CHECK-NEXT: vmrhf %v2, %v24, %v24
-; CHECK-NEXT: vldeb %v1, %v1
+; CHECK-NEXT: vmrlf %v2, %v24, %v24
+; CHECK-NEXT: vmrhf %v3, %v24, %v24
+; CHECK-NEXT: vldeb %v0, %v0
; CHECK-NEXT: vldeb %v2, %v2
-; CHECK-NEXT: vfchdb %v1, %v2, %v1
+; CHECK-NEXT: vldeb %v1, %v1
+; CHECK-NEXT: vldeb %v3, %v3
+; CHECK-NEXT: vfchdb %v0, %v2, %v0
+; CHECK-NEXT: vfchdb %v1, %v3, %v1
+; CHECK-NEXT: vfchdb %v2, %v28, %v30
; CHECK-NEXT: vpkg %v0, %v1, %v0
-; CHECK-NEXT: vfchdb %v1, %v28, %v30
-; CHECK-NEXT: vpkg %v1, %v1, %v1
+; CHECK-NEXT: vpkg %v1, %v2, %v2
; CHECK-NEXT: vo %v0, %v0, %v1
; CHECK-NEXT: vsel %v24, %v25, %v27, %v0
; CHECK-NEXT: br %r14
;
; CHECK-Z14-LABEL: fun25:
; CHECK-Z14: # %bb.0:
-; CHECK-Z14-NEXT: vfchdb %v1, %v28, %v30
-; CHECK-Z14-NEXT: vfchsb %v0, %v24, %v26
-; CHECK-Z14-NEXT: vpkg %v1, %v1, %v1
-; CHECK-Z14-NEXT: vo %v0, %v0, %v1
+; CHECK-Z14-NEXT: vfchdb [[REG0:%v[0-9]+]], %v28, %v30
+; CHECK-Z14-NEXT: vfchsb [[REG1:%v[0-9]+]], %v24, %v26
+; CHECK-Z14-NEXT: vpkg [[REG0]], [[REG0]], [[REG0]]
+; CHECK-Z14-NEXT: vo %v0, [[REG1]], [[REG0]]
; CHECK-Z14-NEXT: vsel %v24, %v25, %v27, %v0
; CHECK-Z14-NEXT: br %r14
%cmp0 = fcmp ogt <2 x float> %val1, %val2
@@ -509,15 +509,15 @@ define <2 x double> @fun26(<2 x float> %val1, <2 x float> %val2, <2 x double> %v
; CHECK-LABEL: fun26:
; CHECK: # %bb.0:
; CHECK-NEXT: vmrlf %v0, %v26, %v26
-; CHECK-NEXT: vmrlf %v1, %v24, %v24
-; CHECK-NEXT: vldeb %v0, %v0
-; CHECK-NEXT: vldeb %v1, %v1
-; CHECK-NEXT: vfchdb %v0, %v1, %v0
; CHECK-NEXT: vmrhf %v1, %v26, %v26
-; CHECK-NEXT: vmrhf %v2, %v24, %v24
-; CHECK-NEXT: vldeb %v1, %v1
+; CHECK-NEXT: vmrlf %v2, %v24, %v24
+; CHECK-NEXT: vmrhf %v3, %v24, %v24
+; CHECK-NEXT: vldeb %v0, %v0
; CHECK-NEXT: vldeb %v2, %v2
-; CHECK-NEXT: vfchdb %v1, %v2, %v1
+; CHECK-NEXT: vldeb %v1, %v1
+; CHECK-NEXT: vldeb %v3, %v3
+; CHECK-NEXT: vfchdb %v0, %v2, %v0
+; CHECK-NEXT: vfchdb %v1, %v3, %v1
; CHECK-NEXT: vpkg %v0, %v1, %v0
; CHECK-NEXT: vuphf %v0, %v0
; CHECK-NEXT: vfchdb %v1, %v28, %v30
@@ -581,8 +581,8 @@ define <4 x float> @fun28(<4 x float> %val1, <4 x float> %val2, <4 x float> %val
; CHECK-DAG: vmrhf [[REG17:%v[0-9]+]], %v30, %v30
; CHECK-DAG: vldeb [[REG19:%v[0-9]+]], [[REG17]]
; CHECK-DAG: vldeb [[REG20:%v[0-9]+]], [[REG8]]
-; CHECK-NEXT: vfchdb %v2, [[REG20]], [[REG19]]
-; CHECK-NEXT: vpkg [[REG21:%v[0-9]+]], %v2, [[REG16]]
+; CHECK-NEXT: vfchdb %v3, [[REG20]], [[REG19]]
+; CHECK-NEXT: vpkg [[REG21:%v[0-9]+]], %v3, [[REG16]]
; CHECK-NEXT: vx %v0, [[REG11]], [[REG21]]
; CHECK-NEXT: vsel %v24, %v25, %v27, %v0
; CHECK-NEXT: br %r14
@@ -605,27 +605,27 @@ define <4 x double> @fun29(<4 x float> %val1, <4 x float> %val2, <4 x float> %va
; CHECK-LABEL: fun29:
; CHECK: # %bb.0:
; CHECK-NEXT: vmrlf %v0, %v26, %v26
-; CHECK-NEXT: vmrlf %v1, %v24, %v24
-; CHECK-NEXT: vldeb %v0, %v0
-; CHECK-NEXT: vldeb %v1, %v1
-; CHECK-NEXT: vfchdb %v0, %v1, %v0
; CHECK-NEXT: vmrhf %v1, %v26, %v26
-; CHECK-NEXT: vmrhf %v2, %v24, %v24
-; CHECK-NEXT: vldeb %v1, %v1
-; CHECK-NEXT: vmrhf %v3, %v28, %v28
+; CHECK-NEXT: vmrlf %v2, %v24, %v24
+; CHECK-NEXT: vmrhf %v3, %v24, %v24
+; CHECK-NEXT: vmrlf %v4, %v30, %v30
+; CHECK-NEXT: vmrhf %v5, %v30, %v30
+; CHECK-NEXT: vmrlf %v6, %v28, %v28
+; CHECK-NEXT: vmrhf %v7, %v28, %v28
+; CHECK-NEXT: vldeb %v0, %v0
; CHECK-NEXT: vldeb %v2, %v2
-; CHECK-NEXT: vfchdb %v1, %v2, %v1
-; CHECK-NEXT: vpkg %v0, %v1, %v0
-; CHECK-NEXT: vmrlf %v1, %v30, %v30
-; CHECK-NEXT: vmrlf %v2, %v28, %v28
+; CHECK-NEXT: vfchdb %v0, %v2, %v0
; CHECK-NEXT: vldeb %v1, %v1
-; CHECK-NEXT: vldeb %v2, %v2
-; CHECK-NEXT: vfchdb %v1, %v2, %v1
-; CHECK-NEXT: vmrhf %v2, %v30, %v30
-; CHECK-NEXT: vldeb %v2, %v2
; CHECK-NEXT: vldeb %v3, %v3
-; CHECK-NEXT: vfchdb %v2, %v3, %v2
-; CHECK-NEXT: vpkg %v1, %v2, %v1
+; CHECK-NEXT: vfchdb %v1, %v3, %v1
+; CHECK-NEXT: vpkg %v0, %v1, %v0
+; CHECK-NEXT: vldeb %v4, %v4
+; CHECK-NEXT: vldeb %v6, %v6
+; CHECK-NEXT: vfchdb %v2, %v6, %v4
+; CHECK-NEXT: vldeb %v5, %v5
+; CHECK-NEXT: vldeb %v7, %v7
+; CHECK-NEXT: vfchdb %v3, %v7, %v5
+; CHECK-NEXT: vpkg %v1, %v3, %v2
; CHECK-NEXT: vx %v0, %v0, %v1
; CHECK-NEXT: vuplf %v1, %v0
; CHECK-NEXT: vuphf %v0, %v0
@@ -653,70 +653,70 @@ define <4 x double> @fun29(<4 x float> %val1, <4 x float> %val2, <4 x float> %va
define <8 x float> @fun30(<8 x float> %val1, <8 x float> %val2, <8 x double> %val3, <8 x double> %val4, <8 x float> %val5, <8 x float> %val6) {
; CHECK-LABEL: fun30:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmrlf %v16, %v28, %v28
-; CHECK-NEXT: vmrlf %v17, %v24, %v24
-; CHECK-NEXT: vldeb %v16, %v16
-; CHECK-NEXT: vldeb %v17, %v17
-; CHECK-NEXT: vfchdb %v16, %v17, %v16
-; CHECK-NEXT: vmrhf %v17, %v28, %v28
-; CHECK-NEXT: vmrhf %v18, %v24, %v24
-; CHECK-NEXT: vldeb %v17, %v17
-; CHECK-NEXT: vl %v4, 192(%r15)
-; CHECK-NEXT: vldeb %v18, %v18
-; CHECK-NEXT: vl %v5, 208(%r15)
-; CHECK-NEXT: vl %v6, 160(%r15)
-; CHECK-NEXT: vl %v7, 176(%r15)
-; CHECK-NEXT: vl %v0, 272(%r15)
-; CHECK-NEXT: vl %v1, 240(%r15)
-; CHECK-NEXT: vfchdb %v17, %v18, %v17
-; CHECK-NEXT: vl %v2, 256(%r15)
+; CHECK-NEXT: vmrlf %v0, %v28, %v28
+; CHECK-NEXT: vmrhf %v1, %v28, %v28
+; CHECK-NEXT: vmrlf %v2, %v24, %v24
+; CHECK-NEXT: vmrhf %v3, %v24, %v24
+; CHECK-NEXT: vmrlf %v4, %v30, %v30
+; CHECK-NEXT: vmrhf %v5, %v30, %v30
+; CHECK-NEXT: vmrlf %v6, %v26, %v26
+; CHECK-NEXT: vmrhf %v7, %v26, %v26
+; CHECK-NEXT: vldeb %v0, %v0
+; CHECK-NEXT: vldeb %v2, %v2
+; CHECK-NEXT: vfchdb %v0, %v2, %v0
+; CHECK-NEXT: vldeb %v1, %v1
+; CHECK-NEXT: vldeb %v3, %v3
+; CHECK-NEXT: vfchdb %v1, %v3, %v1
+; CHECK-NEXT: vpkg %v0, %v1, %v0
+; CHECK-NEXT: vldeb %v4, %v4
+; CHECK-NEXT: vldeb %v6, %v6
+; CHECK-NEXT: vfchdb %v2, %v6, %v4
+; CHECK-NEXT: vl %v4, 176(%r15)
+; CHECK-NEXT: vl %v6, 208(%r15)
+; CHECK-NEXT: vfchdb %v4, %v27, %v4
+; CHECK-NEXT: vfchdb %v6, %v31, %v6
+; CHECK-DAG: vldeb %v5, %v5
+; CHECK-NEXT: vldeb %v7, %v7
+; CHECK-NEXT: vfchdb %v3, %v7, %v5
+; CHECK-NEXT: vl %v5, 160(%r15)
+; CHECK-NEXT: vl %v7, 192(%r15)
+; CHECK-NEXT: vfchdb %v5, %v25, %v5
+; CHECK-NEXT: vfchdb %v7, %v29, %v7
+; CHECK-NEXT: vpkg %v1, %v3, %v2
+; CHECK-NEXT: vpkg %v2, %v5, %v4
+; CHECK-NEXT: vpkg %v3, %v7, %v6
+; CHECK-NEXT: vn %v1, %v1, %v3
; CHECK-NEXT: vl %v3, 224(%r15)
-; CHECK-NEXT: vpkg %v16, %v17, %v16
-; CHECK-NEXT: vmrlf %v17, %v30, %v30
-; CHECK-NEXT: vmrlf %v18, %v26, %v26
-; CHECK-NEXT: vmrhf %v19, %v26, %v26
-; CHECK-NEXT: vfchdb %v7, %v27, %v7
-; CHECK-NEXT: vfchdb %v6, %v25, %v6
-; CHECK-NEXT: vfchdb %v5, %v31, %v5
-; CHECK-NEXT: vfchdb %v4, %v29, %v4
-; CHECK-NEXT: vpkg %v6, %v6, %v7
-; CHECK-NEXT: vpkg %v4, %v4, %v5
-; CHECK-NEXT: vn %v5, %v16, %v6
-; CHECK-DAG: vsel %v24, %v3, %v2, %v5
-; CHECK-DAG: vldeb %v17, %v17
-; CHECK-NEXT: vldeb %v18, %v18
-; CHECK-NEXT: vfchdb %v17, %v18, %v17
-; CHECK-NEXT: vmrhf %v18, %v30, %v30
-; CHECK-NEXT: vldeb %v18, %v18
-; CHECK-NEXT: vldeb %v19, %v19
-; CHECK-NEXT: vfchdb %v18, %v19, %v18
-; CHECK-NEXT: vpkg %v17, %v18, %v17
-; CHECK-NEXT: vn %v4, %v17, %v4
-; CHECK-NEXT: vsel %v26, %v1, %v0, %v4
+; CHECK-NEXT: vn %v0, %v0, %v2
+; CHECK-NEXT: vl %v2, 256(%r15)
+; CHECK-DAG: vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT: vl %v0, 272(%r15)
+; CHECK-NEXT: vl %v2, 240(%r15)
+; CHECK-NEXT: vsel %v26, %v2, %v0, %v1
; CHECK-NEXT: br %r14
;
; CHECK-Z14-LABEL: fun30:
; CHECK-Z14: # %bb.0:
-; CHECK-Z14-NEXT: vl %v4, 192(%r15)
-; CHECK-Z14-NEXT: vl %v5, 208(%r15)
-; CHECK-Z14-NEXT: vl %v6, 160(%r15)
-; CHECK-Z14-NEXT: vl %v7, 176(%r15)
-; CHECK-Z14-NEXT: vfchdb %v7, %v27, %v7
-; CHECK-Z14-NEXT: vfchdb %v6, %v25, %v6
-; CHECK-Z14-NEXT: vfchdb %v5, %v31, %v5
-; CHECK-Z14-NEXT: vfchdb %v4, %v29, %v4
-; CHECK-Z14-DAG: vfchsb %v16, %v24, %v28
-; CHECK-Z14-DAG: vfchsb %v17, %v26, %v30
-; CHECK-Z14-DAG: vpkg %v6, %v6, %v7
-; CHECK-Z14-DAG: vpkg %v4, %v4, %v5
-; CHECK-Z14-DAG: vl %v0, 272(%r15)
-; CHECK-Z14-DAG: vl %v1, 240(%r15)
-; CHECK-Z14-DAG: vl %v2, 256(%r15)
-; CHECK-Z14-DAG: vl %v3, 224(%r15)
-; CHECK-Z14-NEXT: vn %v4, %v17, %v4
-; CHECK-Z14-NEXT: vn %v5, %v16, %v6
-; CHECK-Z14-NEXT: vsel %v24, %v3, %v2, %v5
-; CHECK-Z14-NEXT: vsel %v26, %v1, %v0, %v4
+; CHECK-Z14-NEXT: vl %v0, 192(%r15)
+; CHECK-Z14-NEXT: vl %v1, 208(%r15)
+; CHECK-Z14-NEXT: vl %v2, 160(%r15)
+; CHECK-Z14-NEXT: vl %v3, 176(%r15)
+; CHECK-Z14-NEXT: vfchdb %v3, %v27, %v3
+; CHECK-Z14-NEXT: vfchdb %v2, %v25, %v2
+; CHECK-Z14-NEXT: vfchdb %v1, %v31, %v1
+; CHECK-Z14-NEXT: vfchdb %v0, %v29, %v0
+; CHECK-Z14-DAG: vfchsb %v4, %v24, %v28
+; CHECK-Z14-DAG: vfchsb %v5, %v26, %v30
+; CHECK-Z14-DAG: vpkg %v2, %v2, %v3
+; CHECK-Z14-DAG: vpkg %v0, %v0, %v1
+; CHECK-Z14-DAG: vl %v3, 272(%r15)
+; CHECK-Z14-DAG: vl %v6, 240(%r15)
+; CHECK-Z14-DAG: vl %v7, 256(%r15)
+; CHECK-Z14-DAG: vl %v16, 224(%r15)
+; CHECK-Z14-NEXT: vn %v0, %v5, %v0
+; CHECK-Z14-NEXT: vn %v1, %v4, %v2
+; CHECK-Z14-NEXT: vsel %v24, %v16, %v7, %v1
+; CHECK-Z14-NEXT: vsel %v26, %v6, %v3, %v0
; CHECK-Z14-NEXT: br %r14
%cmp0 = fcmp ogt <8 x float> %val1, %val2
%cmp1 = fcmp ogt <8 x double> %val3, %val4
@@ -759,21 +759,21 @@ define <2 x double> @fun32(<2 x double> %val1, <2 x double> %val2, <2 x double>
define <4 x float> @fun33(<4 x double> %val1, <4 x double> %val2, <4 x float> %val3, <4 x float> %val4, <4 x float> %val5, <4 x float> %val6) {
; CHECK-LABEL: fun33:
; CHECK: # %bb.0:
-; CHECK-NEXT: vfchdb %v0, %v26, %v30
-; CHECK-NEXT: vfchdb %v1, %v24, %v28
-; CHECK-NEXT: vpkg %v0, %v1, %v0
-; CHECK-NEXT: vmrlf %v1, %v27, %v27
+; CHECK-NEXT: vmrlf %v0, %v27, %v27
+; CHECK-NEXT: vmrhf %v1, %v27, %v27
; CHECK-NEXT: vmrlf %v2, %v25, %v25
-; CHECK-NEXT: vldeb %v1, %v1
-; CHECK-NEXT: vldeb %v2, %v2
-; CHECK-NEXT: vfchdb %v1, %v2, %v1
-; CHECK-NEXT: vmrhf %v2, %v27, %v27
; CHECK-NEXT: vmrhf %v3, %v25, %v25
+; CHECK-NEXT: vldeb %v0, %v0
; CHECK-NEXT: vldeb %v2, %v2
+; CHECK-NEXT: vldeb %v1, %v1
; CHECK-NEXT: vldeb %v3, %v3
-; CHECK-NEXT: vfchdb %v2, %v3, %v2
-; CHECK-NEXT: vpkg %v1, %v2, %v1
-; CHECK-NEXT: vn %v0, %v0, %v1
+; CHECK-NEXT: vfchdb %v4, %v26, %v30
+; CHECK-NEXT: vfchdb %v5, %v24, %v28
+; CHECK-NEXT: vfchdb %v0, %v2, %v0
+; CHECK-NEXT: vfchdb %v1, %v3, %v1
+; CHECK-NEXT: vpkg %v2, %v5, %v4
+; CHECK-NEXT: vpkg %v0, %v1, %v0
+; CHECK-NEXT: vn %v0, %v2, %v0
; CHECK-NEXT: vsel %v24, %v29, %v31, %v0
; CHECK-NEXT: br %r14
;
@@ -797,16 +797,16 @@ define <4 x double> @fun34(<4 x double> %val1, <4 x double> %val2, <4 x float> %
; CHECK-LABEL: fun34:
; CHECK: # %bb.0:
; CHECK-NEXT: vmrlf [[REG0:%v[0-9]+]], %v27, %v27
+; CHECK-NEXT: vmrhf [[REG5:%v[0-9]+]], %v27, %v27
; CHECK-NEXT: vmrlf [[REG1:%v[0-9]+]], %v25, %v25
+; CHECK-NEXT: vmrhf [[REG6:%v[0-9]+]], %v25, %v25
; CHECK-NEXT: vldeb [[REG2:%v[0-9]+]], [[REG0]]
; CHECK-NEXT: vldeb [[REG3:%v[0-9]+]], [[REG1]]
-; CHECK-NEXT: vfchdb [[REG4:%v[0-9]+]], [[REG3]], [[REG2]]
-; CHECK-NEXT: vmrhf [[REG5:%v[0-9]+]], %v27, %v27
-; CHECK-NEXT: vmrhf [[REG6:%v[0-9]+]], %v25, %v25
; CHECK-DAG: vldeb [[REG7:%v[0-9]+]], [[REG5]]
-; CHECK-DAG: vl [[REG8:%v[0-9]+]], 176(%r15)
; CHECK-DAG: vldeb [[REG9:%v[0-9]+]], [[REG6]]
+; CHECK-DAG: vl [[REG8:%v[0-9]+]], 176(%r15)
; CHECK-DAG: vl [[REG10:%v[0-9]+]], 160(%r15)
+; CHECK-DAG: vfchdb [[REG4:%v[0-9]+]], [[REG3]], [[REG2]]
; CHECK-DAG: vfchdb [[REG11:%v[0-9]+]], [[REG9]], [[REG7]]
; CHECK-DAG: vpkg [[REG12:%v[0-9]+]], [[REG11]], [[REG4]]
; CHECK-DAG: vuphf [[REG13:%v[0-9]+]], [[REG12]]
@@ -821,17 +821,17 @@ define <4 x double> @fun34(<4 x double> %val1, <4 x double> %val2, <4 x float> %
;
; CHECK-Z14-LABEL: fun34:
; CHECK-Z14: # %bb.0:
-; CHECK-Z14-NEXT: vfchsb %v4, %v25, %v27
-; CHECK-Z14-NEXT: vl %v0, 176(%r15)
-; CHECK-Z14-NEXT: vl %v1, 160(%r15)
-; CHECK-Z14-NEXT: vfchdb %v2, %v24, %v28
-; CHECK-Z14-NEXT: vfchdb %v3, %v26, %v30
-; CHECK-Z14-NEXT: vuphf %v5, %v4
-; CHECK-Z14-NEXT: vuplf %v4, %v4
-; CHECK-Z14-NEXT: vn %v3, %v3, %v4
-; CHECK-Z14-NEXT: vn %v2, %v2, %v5
-; CHECK-Z14-NEXT: vsel %v24, %v29, %v1, %v2
-; CHECK-Z14-NEXT: vsel %v26, %v31, %v0, %v3
+; CHECK-Z14-NEXT: vfchsb %v0, %v25, %v27
+; CHECK-Z14-NEXT: vfchdb %v1, %v24, %v28
+; CHECK-Z14-NEXT: vfchdb %v2, %v26, %v30
+; CHECK-Z14-NEXT: vl %v3, 176(%r15)
+; CHECK-Z14-NEXT: vl %v4, 160(%r15)
+; CHECK-Z14-DAG: vuphf %v5, %v0
+; CHECK-Z14-DAG: vuplf %v0, %v0
+; CHECK-Z14-NEXT: vn %v0, %v2, %v0
+; CHECK-Z14-NEXT: vn %v1, %v1, %v5
+; CHECK-Z14-NEXT: vsel %v24, %v29, %v4, %v1
+; CHECK-Z14-NEXT: vsel %v26, %v31, %v3, %v0
; CHECK-Z14-NEXT: br %r14
%cmp0 = fcmp ogt <4 x double> %val1, %val2
%cmp1 = fcmp ogt <4 x float> %val3, %val4
diff --git a/llvm/test/CodeGen/SystemZ/vec-cmpsel.ll b/llvm/test/CodeGen/SystemZ/vec-cmpsel-01.ll
similarity index 78%
rename from llvm/test/CodeGen/SystemZ/vec-cmpsel.ll
rename to llvm/test/CodeGen/SystemZ/vec-cmpsel-01.ll
index f93ecc348af65..4fa9c5bf37ef1 100644
--- a/llvm/test/CodeGen/SystemZ/vec-cmpsel.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-cmpsel-01.ll
@@ -1,8 +1,8 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; Test that vector compare / select combinations do not produce any
; unnecessary pack /unpack / shift instructions.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s -check-prefix=CHECK-Z14
define <2 x i8> @fun0(<2 x i8> %val1, <2 x i8> %val2, <2 x i8> %val3, <2 x i8> %val4) {
; CHECK-LABEL: fun0:
@@ -42,10 +42,10 @@ define <16 x i16> @fun3(<16 x i8> %val1, <16 x i8> %val2, <16 x i16> %val3, <16
; CHECK-LABEL: fun3:
; CHECK: # %bb.0:
; CHECK-NEXT: vceqb %v0, %v24, %v26
-; CHECK-DAG: vuphb [[REG0:%v[0-9]+]], %v0
-; CHECK-DAG: vuplb [[REG1:%v[0-9]+]], %v0
-; CHECK-NEXT: vsel %v24, %v28, %v25, [[REG0]]
-; CHECK-NEXT: vsel %v26, %v30, %v27, [[REG1]]
+; CHECK-NEXT: vuplb %v1, %v0
+; CHECK-NEXT: vuphb %v0, %v0
+; CHECK-NEXT: vsel %v24, %v28, %v25, %v0
+; CHECK-NEXT: vsel %v26, %v30, %v27, %v1
; CHECK-NEXT: br %r14
%cmp = icmp eq <16 x i8> %val1, %val2
%sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
@@ -55,10 +55,10 @@ define <16 x i16> @fun3(<16 x i8> %val1, <16 x i8> %val2, <16 x i16> %val3, <16
define <32 x i8> @fun4(<32 x i8> %val1, <32 x i8> %val2, <32 x i8> %val3, <32 x i8> %val4) {
; CHECK-LABEL: fun4:
; CHECK: # %bb.0:
-; CHECK-DAG: vceqb [[REG0:%v[0-9]+]], %v26, %v30
-; CHECK-DAG: vceqb [[REG1:%v[0-9]+]], %v24, %v28
-; CHECK-DAG: vsel %v24, %v25, %v29, [[REG1]]
-; CHECK-DAG: vsel %v26, %v27, %v31, [[REG0]]
+; CHECK-NEXT: vceqb %v0, %v26, %v30
+; CHECK-NEXT: vceqb %v1, %v24, %v28
+; CHECK-NEXT: vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT: vsel %v26, %v27, %v31, %v0
; CHECK-NEXT: br %r14
%cmp = icmp eq <32 x i8> %val1, %val2
%sel = select <32 x i1> %cmp, <32 x i8> %val3, <32 x i8> %val4
@@ -127,10 +127,10 @@ define <8 x i32> @fun10(<8 x i16> %val1, <8 x i16> %val2, <8 x i32> %val3, <8 x
; CHECK-LABEL: fun10:
; CHECK: # %bb.0:
; CHECK-NEXT: vceqh %v0, %v24, %v26
-; CHECK-DAG: vuphh [[REG0:%v[0-9]+]], %v0
-; CHECK-DAG: vuplhw [[REG1:%v[0-9]+]], %v0
-; CHECK-NEXT: vsel %v24, %v28, %v25, [[REG0]]
-; CHECK-NEXT: vsel %v26, %v30, %v27, [[REG1]]
+; CHECK-NEXT: vuplhw %v1, %v0
+; CHECK-NEXT: vuphh %v0, %v0
+; CHECK-NEXT: vsel %v24, %v28, %v25, %v0
+; CHECK-NEXT: vsel %v26, %v30, %v27, %v1
; CHECK-NEXT: br %r14
%cmp = icmp eq <8 x i16> %val1, %val2
%sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
@@ -153,10 +153,10 @@ define <16 x i8> @fun11(<16 x i16> %val1, <16 x i16> %val2, <16 x i8> %val3, <16
define <16 x i16> @fun12(<16 x i16> %val1, <16 x i16> %val2, <16 x i16> %val3, <16 x i16> %val4) {
; CHECK-LABEL: fun12:
; CHECK: # %bb.0:
-; CHECK-DAG: vceqh [[REG0:%v[0-9]+]], %v26, %v30
-; CHECK-DAG: vceqh [[REG1:%v[0-9]+]], %v24, %v28
-; CHECK-DAG: vsel %v24, %v25, %v29, [[REG1]]
-; CHECK-DAG: vsel %v26, %v27, %v31, [[REG0]]
+; CHECK-NEXT: vceqh %v0, %v26, %v30
+; CHECK-NEXT: vceqh %v1, %v24, %v28
+; CHECK-NEXT: vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT: vsel %v26, %v27, %v31, %v0
; CHECK-NEXT: br %r14
%cmp = icmp eq <16 x i16> %val1, %val2
%sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
@@ -225,10 +225,10 @@ define <4 x i64> @fun18(<4 x i32> %val1, <4 x i32> %val2, <4 x i64> %val3, <4 x
; CHECK-LABEL: fun18:
; CHECK: # %bb.0:
; CHECK-NEXT: vceqf %v0, %v24, %v26
-; CHECK-DAG: vuphf [[REG0:%v[0-9]+]], %v0
-; CHECK-DAG: vuplf [[REG1]], %v0
-; CHECK-NEXT: vsel %v24, %v28, %v25, [[REG0]]
-; CHECK-NEXT: vsel %v26, %v30, %v27, [[REG1]]
+; CHECK-NEXT: vuplf %v1, %v0
+; CHECK-NEXT: vuphf %v0, %v0
+; CHECK-NEXT: vsel %v24, %v28, %v25, %v0
+; CHECK-NEXT: vsel %v26, %v30, %v27, %v1
; CHECK-NEXT: br %r14
%cmp = icmp eq <4 x i32> %val1, %val2
%sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
@@ -251,10 +251,10 @@ define <8 x i16> @fun19(<8 x i32> %val1, <8 x i32> %val2, <8 x i16> %val3, <8 x
define <8 x i32> @fun20(<8 x i32> %val1, <8 x i32> %val2, <8 x i32> %val3, <8 x i32> %val4) {
; CHECK-LABEL: fun20:
; CHECK: # %bb.0:
-; CHECK-DAG: vceqf [[REG0:%v[0-9]+]], %v26, %v30
-; CHECK-DAG: vceqf [[REG1:%v[0-9]+]], %v24, %v28
-; CHECK-DAG: vsel %v24, %v25, %v29, [[REG1]]
-; CHECK-DAG: vsel %v26, %v27, %v31, [[REG0]]
+; CHECK-NEXT: vceqf %v0, %v26, %v30
+; CHECK-NEXT: vceqf %v1, %v24, %v28
+; CHECK-NEXT: vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT: vsel %v26, %v27, %v31, %v0
; CHECK-NEXT: br %r14
%cmp = icmp eq <8 x i32> %val1, %val2
%sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
@@ -300,10 +300,10 @@ define <4 x i32> @fun23(<4 x i64> %val1, <4 x i64> %val2, <4 x i32> %val3, <4 x
define <4 x i64> @fun24(<4 x i64> %val1, <4 x i64> %val2, <4 x i64> %val3, <4 x i64> %val4) {
; CHECK-LABEL: fun24:
; CHECK: # %bb.0:
-; CHECK-DAG: vceqg [[REG0:%v[0-9]+]], %v26, %v30
-; CHECK-DAG: vceqg [[REG1:%v[0-9]+]], %v24, %v28
-; CHECK-DAG: vsel %v24, %v25, %v29, [[REG1]]
-; CHECK-DAG: vsel %v26, %v27, %v31, [[REG0]]
+; CHECK-NEXT: vceqg %v0, %v26, %v30
+; CHECK-NEXT: vceqg %v1, %v24, %v28
+; CHECK-NEXT: vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT: vsel %v26, %v27, %v31, %v0
; CHECK-NEXT: br %r14
%cmp = icmp eq <4 x i64> %val1, %val2
%sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
@@ -314,25 +314,18 @@ define <2 x float> @fun25(<2 x float> %val1, <2 x float> %val2, <2 x float> %val
; CHECK-LABEL: fun25:
; CHECK: # %bb.0:
; CHECK-NEXT: vmrlf %v0, %v26, %v26
-; CHECK-NEXT: vmrlf %v1, %v24, %v24
-; CHECK-NEXT: vldeb %v0, %v0
-; CHECK-NEXT: vldeb %v1, %v1
-; CHECK-NEXT: vfchdb %v0, %v1, %v0
; CHECK-NEXT: vmrhf %v1, %v26, %v26
-; CHECK-NEXT: vmrhf %v2, %v24, %v24
-; CHECK-NEXT: vldeb %v1, %v1
+; CHECK-NEXT: vmrlf %v2, %v24, %v24
+; CHECK-NEXT: vmrhf %v3, %v24, %v24
+; CHECK-NEXT: vldeb %v0, %v0
; CHECK-NEXT: vldeb %v2, %v2
-; CHECK-NEXT: vfchdb %v1, %v2, %v1
+; CHECK-NEXT: vldeb %v1, %v1
+; CHECK-NEXT: vldeb %v3, %v3
+; CHECK-NEXT: vfchdb %v0, %v2, %v0
+; CHECK-NEXT: vfchdb %v1, %v3, %v1
; CHECK-NEXT: vpkg %v0, %v1, %v0
; CHECK-NEXT: vsel %v24, %v28, %v30, %v0
; CHECK-NEXT: br %r14
-
-; CHECK-Z14-LABEL: fun25:
-; CHECK-Z14: # %bb.0:
-; CHECK-Z14-NEXT: vfchsb %v0, %v24, %v26
-; CHECK-Z14-NEXT: vsel %v24, %v28, %v30, %v0
-; CHECK-Z14-NEXT: br %r14
-
%cmp = fcmp ogt <2 x float> %val1, %val2
%sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
ret <2 x float> %sel
@@ -342,27 +335,19 @@ define <2 x double> @fun26(<2 x float> %val1, <2 x float> %val2, <2 x double> %v
; CHECK-LABEL: fun26:
; CHECK: # %bb.0:
; CHECK-NEXT: vmrlf %v0, %v26, %v26
-; CHECK-NEXT: vmrlf %v1, %v24, %v24
-; CHECK-NEXT: vldeb %v0, %v0
-; CHECK-NEXT: vldeb %v1, %v1
-; CHECK-NEXT: vfchdb %v0, %v1, %v0
; CHECK-NEXT: vmrhf %v1, %v26, %v26
-; CHECK-NEXT: vmrhf %v2, %v24, %v24
-; CHECK-NEXT: vldeb %v1, %v1
+; CHECK-NEXT: vmrlf %v2, %v24, %v24
+; CHECK-NEXT: vmrhf %v3, %v24, %v24
+; CHECK-NEXT: vldeb %v0, %v0
; CHECK-NEXT: vldeb %v2, %v2
-; CHECK-NEXT: vfchdb %v1, %v2, %v1
+; CHECK-NEXT: vldeb %v1, %v1
+; CHECK-NEXT: vldeb %v3, %v3
+; CHECK-NEXT: vfchdb %v0, %v2, %v0
+; CHECK-NEXT: vfchdb %v1, %v3, %v1
; CHECK-NEXT: vpkg %v0, %v1, %v0
; CHECK-NEXT: vuphf %v0, %v0
; CHECK-NEXT: vsel %v24, %v28, %v30, %v0
; CHECK-NEXT: br %r14
-
-; CHECK-Z14-LABEL: fun26:
-; CHECK-Z14: # %bb.0:
-; CHECK-Z14-NEXT: vfchsb %v0, %v24, %v26
-; CHECK-Z14-NEXT: vuphf %v0, %v0
-; CHECK-Z14-NEXT: vsel %v24, %v28, %v30, %v0
-; CHECK-Z14-NEXT: br %r14
-
%cmp = fcmp ogt <2 x float> %val1, %val2
%sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
ret <2 x double> %sel
@@ -387,25 +372,18 @@ define <4 x float> @fun28(<4 x float> %val1, <4 x float> %val2, <4 x float> %val
; CHECK-LABEL: fun28:
; CHECK: # %bb.0:
; CHECK-NEXT: vmrlf %v0, %v26, %v26
-; CHECK-NEXT: vmrlf %v1, %v24, %v24
-; CHECK-NEXT: vldeb %v0, %v0
-; CHECK-NEXT: vldeb %v1, %v1
-; CHECK-NEXT: vfchdb %v0, %v1, %v0
; CHECK-NEXT: vmrhf %v1, %v26, %v26
-; CHECK-NEXT: vmrhf %v2, %v24, %v24
-; CHECK-NEXT: vldeb %v1, %v1
+; CHECK-NEXT: vmrlf %v2, %v24, %v24
+; CHECK-NEXT: vmrhf %v3, %v24, %v24
+; CHECK-NEXT: vldeb %v0, %v0
; CHECK-NEXT: vldeb %v2, %v2
-; CHECK-NEXT: vfchdb %v1, %v2, %v1
+; CHECK-NEXT: vldeb %v1, %v1
+; CHECK-NEXT: vldeb %v3, %v3
+; CHECK-NEXT: vfchdb %v0, %v2, %v0
+; CHECK-NEXT: vfchdb %v1, %v3, %v1
; CHECK-NEXT: vpkg %v0, %v1, %v0
; CHECK-NEXT: vsel %v24, %v28, %v30, %v0
; CHECK-NEXT: br %r14
-
-; CHECK-Z14-LABEL: fun28:
-; CHECK-Z14: # %bb.0:
-; CHECK-Z14-NEXT: vfchsb %v0, %v24, %v26
-; CHECK-Z14-NEXT: vsel %v24, %v28, %v30, %v0
-; CHECK-Z14-NEXT: br %r14
-
%cmp = fcmp ogt <4 x float> %val1, %val2
%sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
ret <4 x float> %sel
@@ -415,44 +393,55 @@ define <4 x double> @fun29(<4 x float> %val1, <4 x float> %val2, <4 x double> %v
; CHECK-LABEL: fun29:
; CHECK: # %bb.0:
; CHECK-NEXT: vmrlf %v0, %v26, %v26
-; CHECK-NEXT: vmrlf %v1, %v24, %v24
-; CHECK-NEXT: vldeb %v0, %v0
-; CHECK-NEXT: vldeb %v1, %v1
-; CHECK-NEXT: vfchdb %v0, %v1, %v0
; CHECK-NEXT: vmrhf %v1, %v26, %v26
-; CHECK-NEXT: vmrhf %v2, %v24, %v24
-; CHECK-NEXT: vldeb %v1, %v1
+; CHECK-NEXT: vmrlf %v2, %v24, %v24
+; CHECK-NEXT: vmrhf %v3, %v24, %v24
+; CHECK-NEXT: vldeb %v0, %v0
; CHECK-NEXT: vldeb %v2, %v2
-; CHECK-NEXT: vfchdb %v1, %v2, %v1
-; CHECK-NEXT: vpkg [[REG0:%v[0-9]+]], %v1, %v0
-; CHECK-DAG: vuplf [[REG1:%v[0-9]+]], [[REG0]]
-; CHECK-DAG: vuphf [[REG2:%v[0-9]+]], [[REG0]]
-; CHECK-NEXT: vsel %v24, %v28, %v25, [[REG2]]
-; CHECK-NEXT: vsel %v26, %v30, %v27, [[REG1]]
-; CHECK-NEXT: br %r14
-
-; CHECK-Z14-LABEL: fun29:
-; CHECK-Z14: # %bb.0:
-; CHECK-Z14-NEXT: vfchsb %v0, %v24, %v26
-; CHECK-Z14-DAG: vuphf [[REG0:%v[0-9]+]], %v0
+; CHECK-NEXT: vldeb %v1, %v1
+; CHECK-NEXT: vldeb %v3, %v3
+; CHECK-NEXT: vfchdb %v0, %v2, %v0
+; CHECK-NEXT: vfchdb %v1, %v3, %v1
+; CHECK-NEXT: vpkg %v0, %v1, %v0
+; CHECK-NEXT: vuplf %v1, %v0
+; CHECK-NEXT: vuphf %v0, %v0
+; CHECK-NEXT: vsel %v24, %v28, %v25, %v0
+; CHECK-NEXT: vsel %v26, %v30, %v27, %v1
+; CHECK-NEXT: br %r14
; CHECK-Z14-DAG: vuplf [[REG1:%v[0-9]+]], %v0
-; CHECK-Z14-NEXT: vsel %v24, %v28, %v25, [[REG0]]
-; CHECK-Z14-NEXT: vsel %v26, %v30, %v27, [[REG1]]
-; CHECK-Z14-NEXT: br %r14
-
%cmp = fcmp ogt <4 x float> %val1, %val2
%sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
ret <4 x double> %sel
}
define <8 x float> @fun30(<8 x float> %val1, <8 x float> %val2, <8 x float> %val3, <8 x float> %val4) {
-; CHECK-Z14-LABEL: fun30:
-; CHECK-Z14: # %bb.0:
-; CHECK-Z14-DAG: vfchsb [[REG0:%v[0-9]+]], %v26, %v30
-; CHECK-Z14-DAG: vfchsb [[REG1:%v[0-9]+]], %v24, %v28
-; CHECK-Z14-DAG: vsel %v24, %v25, %v29, [[REG1]]
-; CHECK-Z14-DAG: vsel %v26, %v27, %v31, [[REG0]]
-; CHECK-Z14-NEXT: br %r14
+; CHECK-LABEL: fun30:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmrlf %v0, %v30, %v30
+; CHECK-NEXT: vmrhf %v1, %v30, %v30
+; CHECK-NEXT: vmrlf %v2, %v26, %v26
+; CHECK-NEXT: vmrhf %v3, %v26, %v26
+; CHECK-NEXT: vmrlf %v4, %v28, %v28
+; CHECK-NEXT: vmrhf %v5, %v28, %v28
+; CHECK-NEXT: vmrlf %v6, %v24, %v24
+; CHECK-NEXT: vmrhf %v7, %v24, %v24
+; CHECK-NEXT: vldeb %v0, %v0
+; CHECK-NEXT: vldeb %v2, %v2
+; CHECK-NEXT: vfchdb %v0, %v2, %v0
+; CHECK-NEXT: vldeb %v1, %v1
+; CHECK-NEXT: vldeb %v3, %v3
+; CHECK-NEXT: vfchdb %v1, %v3, %v1
+; CHECK-NEXT: vpkg %v0, %v1, %v0
+; CHECK-NEXT: vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT: vldeb %v4, %v4
+; CHECK-NEXT: vldeb %v6, %v6
+; CHECK-NEXT: vfchdb %v2, %v6, %v4
+; CHECK-NEXT: vldeb %v5, %v5
+; CHECK-NEXT: vldeb %v7, %v7
+; CHECK-NEXT: vfchdb %v3, %v7, %v5
+; CHECK-NEXT: vpkg %v1, %v3, %v2
+; CHECK-NEXT: vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT: br %r14
%cmp = fcmp ogt <8 x float> %val1, %val2
%sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
ret <8 x float> %sel
@@ -498,10 +487,10 @@ define <4 x float> @fun33(<4 x double> %val1, <4 x double> %val2, <4 x float> %v
define <4 x double> @fun34(<4 x double> %val1, <4 x double> %val2, <4 x double> %val3, <4 x double> %val4) {
; CHECK-LABEL: fun34:
; CHECK: # %bb.0:
-; CHECK-DAG: vfchdb [[REG0:%v[0-9]+]], %v26, %v30
-; CHECK-DAG: vfchdb [[REG1:%v[0-9]+]], %v24, %v28
-; CHECK-DAG: vsel %v24, %v25, %v29, [[REG1]]
-; CHECK-DAG: vsel %v26, %v27, %v31, [[REG0]]
+; CHECK-NEXT: vfchdb %v0, %v26, %v30
+; CHECK-NEXT: vfchdb %v1, %v24, %v28
+; CHECK-NEXT: vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT: vsel %v26, %v27, %v31, %v0
; CHECK-NEXT: br %r14
%cmp = fcmp ogt <4 x double> %val1, %val2
%sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
diff --git a/llvm/test/CodeGen/SystemZ/vec-cmpsel-02.ll b/llvm/test/CodeGen/SystemZ/vec-cmpsel-02.ll
new file mode 100644
index 0000000000000..9daf5c984c041
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-cmpsel-02.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test that vector float compare / select combinations do not produce any
+; unnecessary pack /unpack / shift instructions.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
+
+define <2 x float> @fun0(<2 x float> %val1, <2 x float> %val2, <2 x float> %val3, <2 x float> %val4) {
+; CHECK-LABEL: fun0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfchsb %v0, %v24, %v26
+; CHECK-NEXT: vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT: br %r14
+
+ %cmp = fcmp ogt <2 x float> %val1, %val2
+ %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+ ret <2 x float> %sel
+}
+
+define <2 x double> @fun1(<2 x float> %val1, <2 x float> %val2, <2 x double> %val3, <2 x double> %val4) {
+; CHECK-LABEL: fun1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfchsb %v0, %v24, %v26
+; CHECK-NEXT: vuphf %v0, %v0
+; CHECK-NEXT: vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT: br %r14
+
+ %cmp = fcmp ogt <2 x float> %val1, %val2
+ %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+ ret <2 x double> %sel
+}
+
+define <4 x float> @fun2(<4 x float> %val1, <4 x float> %val2, <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: fun2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfchsb %v0, %v24, %v26
+; CHECK-NEXT: vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT: br %r14
+
+ %cmp = fcmp ogt <4 x float> %val1, %val2
+ %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+ ret <4 x float> %sel
+}
+
+define <4 x double> @fun3(<4 x float> %val1, <4 x float> %val2, <4 x double> %val3, <4 x double> %val4) {
+; CHECK-LABEL: fun3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfchsb %v0, %v24, %v26
+; CHECK-NEXT: vuplf %v1, %v0
+; CHECK-NEXT: vuphf %v0, %v0
+; CHECK-NEXT: vsel %v24, %v28, %v25, %v0
+; CHECK-NEXT: vsel %v26, %v30, %v27, %v1
+; CHECK-NEXT: br %r14
+ %cmp = fcmp ogt <4 x float> %val1, %val2
+ %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+ ret <4 x double> %sel
+}
+
+define <8 x float> @fun4(<8 x float> %val1, <8 x float> %val2, <8 x float> %val3, <8 x float> %val4) {
+; CHECK-Z14-LABEL: fun4:
+; CHECK-LABEL: fun4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfchsb %v0, %v26, %v30
+; CHECK-NEXT: vfchsb %v1, %v24, %v28
+; CHECK-NEXT: vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT: vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT: br %r14
+ %cmp = fcmp ogt <8 x float> %val1, %val2
+ %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+ ret <8 x float> %sel
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-eval.ll b/llvm/test/CodeGen/SystemZ/vec-eval.ll
index bcdedcd3a407b..f88531e82bc25 100644
--- a/llvm/test/CodeGen/SystemZ/vec-eval.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-eval.ll
@@ -1540,12 +1540,13 @@ define <16 x i8> @eval109(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) {
; CHECK-LABEL: eval109:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vgbm %v0, 65535
+; CHECK-NEXT: vn %v1, %v26, %v24
; CHECK-NEXT: veval %v0, %v24, %v0, %v26, 40
-; CHECK-NEXT: vn %v2, %v26, %v24
-; CHECK-NEXT: veval %v0, %v28, %v0, %v2, 7
+; CHECK-NEXT: vo %v2, %v28, %v24
+; CHECK-NEXT: veval %v0, %v28, %v0, %v1, 7
; CHECK-NEXT: vo %v1, %v28, %v24
; CHECK-NEXT: veval %v0, %v0, %v24, %v26, 47
-; CHECK-NEXT: veval %v24, %v0, %v26, %v1, 47
+; CHECK-NEXT: veval %v24, %v0, %v26, %v2, 47
; CHECK-NEXT: br %r14
entry:
%not = xor <16 x i8> %src1, splat(i8 -1)
@@ -1753,12 +1754,13 @@ define <16 x i8> @eval121(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) {
; CHECK-LABEL: eval121:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vgbm %v0, 65535
+; CHECK-NEXT: vn %v1, %v26, %v24
; CHECK-NEXT: veval %v0, %v24, %v0, %v26, 40
-; CHECK-NEXT: vn %v2, %v26, %v24
-; CHECK-NEXT: veval %v0, %v28, %v0, %v2, 7
+; CHECK-NEXT: vo %v2, %v28, %v26
+; CHECK-NEXT: veval %v0, %v28, %v0, %v1, 7
; CHECK-NEXT: vo %v1, %v28, %v26
; CHECK-NEXT: veval %v0, %v0, %v26, %v24, 47
-; CHECK-NEXT: veval %v24, %v0, %v24, %v1, 47
+; CHECK-NEXT: veval %v24, %v0, %v24, %v2, 47
; CHECK-NEXT: br %r14
entry:
%not = xor <16 x i8> %src1, splat(i8 -1)
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-23.ll b/llvm/test/CodeGen/SystemZ/vec-move-23.ll
index 1976e6710ecf9..547e8eb17169e 100644
--- a/llvm/test/CodeGen/SystemZ/vec-move-23.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-23.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s -check-prefixes=CHECK,Z14
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s -check-prefixes=CHECK,Z15
;
@@ -5,12 +6,13 @@
define void @fun0(<2 x i8> %Src, ptr %Dst) {
; CHECK-LABEL: fun0:
-; CHECK: vuphb %v0, %v24
-; CHECK-NEXT: vuphh %v0, %v0
-; CHECK-NEXT: vuphf %v0, %v0
-; CHECK-NEXT: vcdgb %v0, %v0, 0, 0
-; CHECK-NEXT: vst %v0, 0(%r2), 3
-; CHECK-NEXT: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: vuphb %v0, %v24
+; CHECK-NEXT: vuphh %v0, %v0
+; CHECK-NEXT: vuphf %v0, %v0
+; CHECK-NEXT: vcdgb %v0, %v0, 0, 0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
%c = sitofp <2 x i8> %Src to <2 x double>
store <2 x double> %c, ptr %Dst
ret void
@@ -18,11 +20,12 @@ define void @fun0(<2 x i8> %Src, ptr %Dst) {
define void @fun1(<2 x i16> %Src, ptr %Dst) {
; CHECK-LABEL: fun1:
-; CHECK: vuphh %v0, %v24
-; CHECK-NEXT: vuphf %v0, %v0
-; CHECK-NEXT: vcdgb %v0, %v0, 0, 0
-; CHECK-NEXT: vst %v0, 0(%r2), 3
-; CHECK-NEXT: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: vuphh %v0, %v24
+; CHECK-NEXT: vuphf %v0, %v0
+; CHECK-NEXT: vcdgb %v0, %v0, 0, 0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
%c = sitofp <2 x i16> %Src to <2 x double>
store <2 x double> %c, ptr %Dst
ret void
@@ -30,37 +33,42 @@ define void @fun1(<2 x i16> %Src, ptr %Dst) {
define void @fun2(<2 x i32> %Src, ptr %Dst) {
; CHECK-LABEL: fun2:
-; CHECK: vuphf %v0, %v24
-; CHECK-NEXT: vcdgb %v0, %v0, 0, 0
-; CHECK-NEXT: vst %v0, 0(%r2), 3
-; CHECK-NEXT: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: vuphf %v0, %v24
+; CHECK-NEXT: vcdgb %v0, %v0, 0, 0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
%c = sitofp <2 x i32> %Src to <2 x double>
store <2 x double> %c, ptr %Dst
ret void
}
define void @fun3(<4 x i16> %Src, ptr %Dst) {
-; CHECK-LABEL: fun3:
+; Z14-LABEL: fun3:
+; Z14: # %bb.0:
+; Z14-NEXT: vuphh %v0, %v24
+; Z14-NEXT: vlgvf %r0, %v0, 3
+; Z14-NEXT: vlgvf %r1, %v0, 2
+; Z14-NEXT: cefbr %f1, %r1
+; Z14-NEXT: vlgvf %r3, %v0, 1
+; Z14-NEXT: cefbr %f2, %r3
+; Z14-NEXT: vlgvf %r4, %v0, 0
+; Z14-NEXT: cefbr %f0, %r0
+; Z14-NEXT: vmrhf %v0, %v1, %v0
+; Z14-NEXT: cefbr %f3, %r4
+; Z14-NEXT: vmrhf %v1, %v3, %v2
+; Z14-NEXT: vmrhg %v0, %v1, %v0
+; Z14-NEXT: vst %v0, 0(%r2), 3
+; Z14-NEXT: br %r14
+;
+; Z15-LABEL: fun3:
+; Z15: # %bb.0:
+; Z15-NEXT: vuphh %v0, %v24
+; Z15-NEXT: vcefb %v0, %v0, 0, 0
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
-; Z14: vuphh %v0, %v24
-; Z14-NEXT: vlgvf %r0, %v0, 3
-; Z14-NEXT: cefbr %f1, %r0
-; Z14-NEXT: vlgvf %r0, %v0, 2
-; Z14-NEXT: cefbr %f2, %r0
-; Z14-NEXT: vlgvf %r0, %v0, 1
-; Z14-NEXT: vmrhf %v1, %v2, %v1
-; Z14-NEXT: cefbr %f2, %r0
-; Z14-NEXT: vlgvf %r0, %v0, 0
-; Z14-NEXT: cefbr %f0, %r0
-; Z14-NEXT: vmrhf %v0, %v0, %v2
-; Z14-NEXT: vmrhg %v0, %v0, %v1
-; Z14-NEXT: vst %v0, 0(%r2), 3
-; Z14-NEXT: br %r14
-; Z15: vuphh %v0, %v24
-; Z15-NEXT: vcefb %v0, %v0, 0, 0
-; Z15-NEXT: vst %v0, 0(%r2), 3
-; Z15-NEXT: br %r14
%c = sitofp <4 x i16> %Src to <4 x float>
store <4 x float> %c, ptr %Dst
ret void
@@ -68,12 +76,13 @@ define void @fun3(<4 x i16> %Src, ptr %Dst) {
define void @fun4(<2 x i8> %Src, ptr %Dst) {
; CHECK-LABEL: fun4:
-; CHECK: larl %r1, .LCPI4_0
-; CHECK-NEXT: vl %v0, 0(%r1), 3
-; CHECK-NEXT: vperm %v0, %v0, %v24, %v0
-; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
-; CHECK-NEXT: vst %v0, 0(%r2), 3
-; CHECK-NEXT: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI4_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v0, %v0, %v24, %v0
+; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
%c = uitofp <2 x i8> %Src to <2 x double>
store <2 x double> %c, ptr %Dst
ret void
@@ -81,12 +90,13 @@ define void @fun4(<2 x i8> %Src, ptr %Dst) {
define void @fun5(<2 x i16> %Src, ptr %Dst) {
; CHECK-LABEL: fun5:
-; CHECK: larl %r1, .LCPI5_0
-; CHECK-NEXT: vl %v0, 0(%r1), 3
-; CHECK-NEXT: vperm %v0, %v0, %v24, %v0
-; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
-; CHECK-NEXT: vst %v0, 0(%r2), 3
-; CHECK-NEXT: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI5_0
+; CHECK-NEXT: vl %v0, 0(%r1), 3
+; CHECK-NEXT: vperm %v0, %v0, %v24, %v0
+; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
%c = uitofp <2 x i16> %Src to <2 x double>
store <2 x double> %c, ptr %Dst
ret void
@@ -94,37 +104,42 @@ define void @fun5(<2 x i16> %Src, ptr %Dst) {
define void @fun6(<2 x i32> %Src, ptr %Dst) {
; CHECK-LABEL: fun6:
-; CHECK: vuplhf %v0, %v24
-; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
-; CHECK-NEXT: vst %v0, 0(%r2), 3
-; CHECK-NEXT: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: vuplhf %v0, %v24
+; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
%c = uitofp <2 x i32> %Src to <2 x double>
store <2 x double> %c, ptr %Dst
ret void
}
define void @fun7(<4 x i16> %Src, ptr %Dst) {
-; CHECK-LABEL: fun7:
+; Z14-LABEL: fun7:
+; Z14: # %bb.0:
+; Z14-NEXT: vuplhh %v0, %v24
+; Z14-NEXT: vlgvf %r0, %v0, 3
+; Z14-NEXT: vlgvf %r1, %v0, 2
+; Z14-NEXT: celfbr %f1, 0, %r1, 0
+; Z14-NEXT: vlgvf %r3, %v0, 1
+; Z14-NEXT: celfbr %f2, 0, %r3, 0
+; Z14-NEXT: vlgvf %r4, %v0, 0
+; Z14-NEXT: celfbr %f0, 0, %r0, 0
+; Z14-NEXT: vmrhf %v0, %v1, %v0
+; Z14-NEXT: celfbr %f3, 0, %r4, 0
+; Z14-NEXT: vmrhf %v1, %v3, %v2
+; Z14-NEXT: vmrhg %v0, %v1, %v0
+; Z14-NEXT: vst %v0, 0(%r2), 3
+; Z14-NEXT: br %r14
+;
+; Z15-LABEL: fun7:
+; Z15: # %bb.0:
+; Z15-NEXT: vuplhh %v0, %v24
+; Z15-NEXT: vcelfb %v0, %v0, 0, 0
+; Z15-NEXT: vst %v0, 0(%r2), 3
+; Z15-NEXT: br %r14
-; Z14: vuplhh %v0, %v24
-; Z14-NEXT: vlgvf %r0, %v0, 3
-; Z14-NEXT: celfbr %f1, 0, %r0, 0
-; Z14-NEXT: vlgvf %r0, %v0, 2
-; Z14-NEXT: celfbr %f2, 0, %r0, 0
-; Z14-NEXT: vlgvf %r0, %v0, 1
-; Z14-NEXT: vmrhf %v1, %v2, %v1
-; Z14-NEXT: celfbr %f2, 0, %r0, 0
-; Z14-NEXT: vlgvf %r0, %v0, 0
-; Z14-NEXT: celfbr %f0, 0, %r0, 0
-; Z14-NEXT: vmrhf %v0, %v0, %v2
-; Z14-NEXT: vmrhg %v0, %v0, %v1
-; Z14-NEXT: vst %v0, 0(%r2), 3
-; Z14-NEXT: br %r14
-; Z15: vuplhh %v0, %v24
-; Z15-NEXT: vcelfb %v0, %v0, 0, 0
-; Z15-NEXT: vst %v0, 0(%r2), 3
-; Z15-NEXT: br %r14
%c = uitofp <4 x i16> %Src to <4 x float>
store <4 x float> %c, ptr %Dst
ret void
@@ -132,9 +147,17 @@ define void @fun7(<4 x i16> %Src, ptr %Dst) {
; Test that this does not crash but results in scalarized conversions.
define void @fun8(<2 x i64> %dwords, ptr %ptr) {
-; CHECK-LABEL: fun8
-; CHECK: vlgvg
-; CHECK: cxlgbr
+; CHECK-LABEL: fun8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vlgvg %r0, %v24, 0
+; CHECK-NEXT: vlgvg %r1, %v24, 1
+; CHECK-NEXT: cxlgbr %f0, 0, %r0, 0
+; CHECK-NEXT: cxlgbr %f1, 0, %r1, 0
+; CHECK-NEXT: vmrhg %v1, %v1, %v3
+; CHECK-NEXT: vmrhg %v0, %v0, %v2
+; CHECK-NEXT: vst %v1, 16(%r2), 4
+; CHECK-NEXT: vst %v0, 0(%r2), 4
+; CHECK-NEXT: br %r14
%conv = uitofp <2 x i64> %dwords to <2 x fp128>
store <2 x fp128> %conv, ptr %ptr
ret void
@@ -142,19 +165,76 @@ define void @fun8(<2 x i64> %dwords, ptr %ptr) {
; Test that this results in vectorized conversions.
define void @fun9(ptr %Src, ptr %ptr) {
-; CHECK-LABEL: fun9
-; Z15: vl %v0, 16(%r2), 4
-; Z15-NEXT: vl %v1, 0(%r2), 4
-; Z15-NEXT: vuplhh %v2, %v1
-; Z15-NEXT: vupllh %v1, %v1
-; Z15-NEXT: vuplhh %v0, %v0
-; Z15-NEXT: vcelfb %v2, %v2, 0, 0
-; Z15-NEXT: vcelfb %v1, %v1, 0, 0
-; Z15-NEXT: vcelfb %v0, %v0, 0, 0
-; Z15-NEXT: vsteg %v0, 32(%r3), 0
-; Z15-NEXT: vst %v1, 16(%r3), 4
-; Z15-NEXT: vst %v2, 0(%r3), 4
-; Z15-NEXT: br %r14
+; Z14-LABEL: fun9:
+; Z14: # %bb.0:
+; Z14-NEXT: stmg %r10, %r15, 80(%r15)
+; Z14-NEXT: .cfi_offset %r10, -80
+; Z14-NEXT: .cfi_offset %r11, -72
+; Z14-NEXT: .cfi_offset %r12, -64
+; Z14-NEXT: .cfi_offset %r13, -56
+; Z14-NEXT: .cfi_offset %r14, -48
+; Z14-NEXT: .cfi_offset %r15, -40
+; Z14-NEXT: aghi %r15, -16
+; Z14-NEXT: .cfi_def_cfa_offset 176
+; Z14-NEXT: std %f8, 8(%r15) # 8-byte Spill
+; Z14-NEXT: std %f9, 0(%r15) # 8-byte Spill
+; Z14-NEXT: .cfi_offset %f8, -168
+; Z14-NEXT: .cfi_offset %f9, -176
+; Z14-NEXT: vl %v0, 16(%r2), 4
+; Z14-NEXT: vl %v1, 0(%r2), 4
+; Z14-NEXT: vuplhh %v2, %v1
+; Z14-NEXT: vupllh %v1, %v1
+; Z14-NEXT: vuplhh %v0, %v0
+; Z14-NEXT: vlgvf %r0, %v2, 3
+; Z14-NEXT: vlgvf %r1, %v2, 2
+; Z14-NEXT: vlgvf %r2, %v2, 1
+; Z14-NEXT: vlgvf %r4, %v2, 0
+; Z14-NEXT: celfbr %f2, 0, %r2, 0
+; Z14-NEXT: vlgvf %r5, %v1, 3
+; Z14-NEXT: celfbr %f3, 0, %r4, 0
+; Z14-NEXT: vlgvf %r14, %v1, 2
+; Z14-NEXT: celfbr %f4, 0, %r5, 0
+; Z14-NEXT: vlgvf %r13, %v1, 1
+; Z14-NEXT: celfbr %f5, 0, %r14, 0
+; Z14-NEXT: vlgvf %r12, %v1, 0
+; Z14-NEXT: celfbr %f1, 0, %r1, 0
+; Z14-NEXT: vlgvf %r11, %v0, 1
+; Z14-NEXT: celfbr %f6, 0, %r13, 0
+; Z14-NEXT: vlgvf %r10, %v0, 0
+; Z14-NEXT: celfbr %f0, 0, %r0, 0
+; Z14-NEXT: vmrhf %v0, %v1, %v0
+; Z14-NEXT: celfbr %f7, 0, %r12, 0
+; Z14-NEXT: vmrhf %v1, %v3, %v2
+; Z14-NEXT: celfbr %f8, 0, %r11, 0
+; Z14-NEXT: vmrhf %v2, %v5, %v4
+; Z14-NEXT: celfbr %f9, 0, %r10, 0
+; Z14-NEXT: vmrhf %v3, %v7, %v6
+; Z14-NEXT: vmrhf %v4, %v9, %v8
+; Z14-NEXT: ld %f8, 8(%r15) # 8-byte Reload
+; Z14-NEXT: ld %f9, 0(%r15) # 8-byte Reload
+; Z14-NEXT: vmrhg %v0, %v1, %v0
+; Z14-NEXT: vmrhg %v1, %v3, %v2
+; Z14-NEXT: vmrhg %v2, %v4, %v4
+; Z14-NEXT: vsteg %v2, 32(%r3), 0
+; Z14-NEXT: vst %v1, 16(%r3), 4
+; Z14-NEXT: vst %v0, 0(%r3), 4
+; Z14-NEXT: lmg %r10, %r15, 96(%r15)
+; Z14-NEXT: br %r14
+;
+; Z15-LABEL: fun9:
+; Z15: # %bb.0:
+; Z15-NEXT: vl %v0, 16(%r2), 4
+; Z15-NEXT: vl %v1, 0(%r2), 4
+; Z15-NEXT: vuplhh %v2, %v1
+; Z15-NEXT: vupllh %v1, %v1
+; Z15-NEXT: vuplhh %v0, %v0
+; Z15-NEXT: vcelfb %v2, %v2, 0, 0
+; Z15-NEXT: vcelfb %v1, %v1, 0, 0
+; Z15-NEXT: vcelfb %v0, %v0, 0, 0
+; Z15-NEXT: vsteg %v0, 32(%r3), 0
+; Z15-NEXT: vst %v1, 16(%r3), 4
+; Z15-NEXT: vst %v2, 0(%r3), 4
+; Z15-NEXT: br %r14
%Val = load <10 x i16>, ptr %Src
%conv = uitofp <10 x i16> %Val to <10 x float>
diff --git a/llvm/test/CodeGen/SystemZ/vec-sub-01.ll b/llvm/test/CodeGen/SystemZ/vec-sub-01.ll
index e1e08ebaaef47..08062b35e30e9 100644
--- a/llvm/test/CodeGen/SystemZ/vec-sub-01.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-sub-01.ll
@@ -44,20 +44,20 @@ define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
; the VSLDBs use the result of the VLRs or use %v24 and %v26 directly.
define <4 x float> @f5(<4 x float> %val1, <4 x float> %val2) {
; CHECK-LABEL: f5:
-; CHECK-DAG: vlr %v[[A1:[0-5]]], %v24
-; CHECK-DAG: vlr %v[[A2:[0-5]]], %v26
-; CHECK-DAG: vrepf %v[[B1:[0-5]]], %v24, 1
-; CHECK-DAG: vrepf %v[[B2:[0-5]]], %v26, 1
-; CHECK-DAG: vrepf %v[[C1:[0-5]]], %v24, 2
+; CHECK-DAG: vrepf %v[[B2:[0-5]]], %v26, 3
+; CHECK-DAG: vrepf %v[[B1:[0-5]]], %v24, 3
; CHECK-DAG: vrepf %v[[C2:[0-5]]], %v26, 2
-; CHECK-DAG: vrepf %v[[D1:[0-5]]], %v24, 3
-; CHECK-DAG: vrepf %v[[D2:[0-5]]], %v26, 3
-; CHECK-DAG: sebr %f[[A1]], %f[[A2]]
+; CHECK-DAG: vrepf %v[[C1:[0-5]]], %v24, 2
+; CHECK-DAG: vrepf %v[[D2:[0-7]]], %v26, 1
+; CHECK-DAG: vrepf %v[[D1:[0-7]]], %v24, 1
+; CHECK-DAG: vlr %v[[A2:[0-5]]], %v26
+; CHECK-DAG: vlr %v[[A1:[0-5]]], %v24
; CHECK-DAG: sebr %f[[B1]], %f[[B2]]
; CHECK-DAG: sebr %f[[C1]], %f[[C2]]
+; CHECK-DAG: sebr %f[[A1]], %f[[A2]]
; CHECK-DAG: sebr %f[[D1]], %f[[D2]]
-; CHECK-DAG: vmrhf [[HIGH:%v[0-9]+]], %v[[A1]], %v[[B1]]
-; CHECK-DAG: vmrhf [[LOW:%v[0-9]+]], %v[[C1]], %v[[D1]]
+; CHECK-DAG: vmrhf [[LOW:%v[0-9]+]], %v[[C1]], %v[[B1]]
+; CHECK-DAG: vmrhf [[HIGH:%v[0-9]+]], %v[[A1]], %v[[D1]]
; CHECK: vmrhg %v24, [[HIGH]], [[LOW]]
; CHECK: br %r14
%ret = fsub <4 x float> %val1, %val2
diff --git a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
index 614f7b243c7e2..99e3c8aaa4a64 100644
--- a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
@@ -76,9 +76,9 @@ define <3 x float> @constrained_vector_fdiv_v3f32() #0 {
; SZ13-LABEL: constrained_vector_fdiv_v3f32:
; SZ13: # %bb.0: # %entry
; SZ13-NEXT: larl %r1, .LCPI2_0
+; SZ13-NEXT: larl %r2, .LCPI2_1
; SZ13-NEXT: lde %f0, 0(%r1)
-; SZ13-NEXT: larl %r1, .LCPI2_1
-; SZ13-NEXT: lde %f1, 0(%r1)
+; SZ13-NEXT: lde %f1, 0(%r2)
; SZ13-NEXT: debr %f1, %f0
; SZ13-NEXT: vgmf %v2, 2, 8
; SZ13-NEXT: vgmf %v3, 1, 1
@@ -116,15 +116,15 @@ define void @constrained_vector_fdiv_v3f64(ptr %a) #0 {
;
; SZ13-LABEL: constrained_vector_fdiv_v3f64:
; SZ13: # %bb.0: # %entry
-; SZ13-NEXT: larl %r1, .LCPI3_0
-; SZ13-NEXT: ld %f1, 0(%r1)
-; SZ13-NEXT: ddb %f1, 16(%r2)
; SZ13-NEXT: larl %r1, .LCPI3_1
; SZ13-NEXT: vl %v0, 0(%r2), 4
-; SZ13-NEXT: vl %v2, 0(%r1), 3
-; SZ13-NEXT: std %f1, 16(%r2)
-; SZ13-NEXT: vfddb %v0, %v2, %v0
+; SZ13-NEXT: vl %v1, 0(%r1), 3
+; SZ13-NEXT: vfddb %v0, %v1, %v0
+; SZ13-NEXT: larl %r1, .LCPI3_0
+; SZ13-NEXT: ld %f2, 0(%r1)
+; SZ13-NEXT: ddb %f2, 16(%r2)
; SZ13-NEXT: vst %v0, 0(%r2), 4
+; SZ13-NEXT: std %f2, 16(%r2)
; SZ13-NEXT: br %r14
entry:
%b = load <3 x double>, ptr %a
@@ -159,13 +159,13 @@ define <4 x double> @constrained_vector_fdiv_v4f64() #0 {
; SZ13-LABEL: constrained_vector_fdiv_v4f64:
; SZ13: # %bb.0: # %entry
; SZ13-NEXT: larl %r1, .LCPI4_0
+; SZ13-NEXT: larl %r2, .LCPI4_1
+; SZ13-NEXT: larl %r3, .LCPI4_2
; SZ13-NEXT: vl %v0, 0(%r1), 3
-; SZ13-NEXT: larl %r1, .LCPI4_1
-; SZ13-NEXT: vl %v1, 0(%r1), 3
+; SZ13-NEXT: vl %v1, 0(%r2), 3
; SZ13-NEXT: vfddb %v26, %v1, %v0
-; SZ13-NEXT: larl %r1, .LCPI4_2
-; SZ13-NEXT: vl %v1, 0(%r1), 3
-; SZ13-NEXT: vfddb %v24, %v1, %v0
+; SZ13-NEXT: vl %v2, 0(%r3), 3
+; SZ13-NEXT: vfddb %v24, %v2, %v0
; SZ13-NEXT: br %r14
entry:
%div = call <4 x double> @llvm.experimental.constrained.fdiv.v4f64(
@@ -628,15 +628,15 @@ define <3 x float> @constrained_vector_fmul_v3f32() #0 {
; SZ13-LABEL: constrained_vector_fmul_v3f32:
; SZ13: # %bb.0: # %entry
; SZ13-NEXT: vgmf %v0, 1, 8
+; SZ13-NEXT: vgmf %v1, 2, 8
; SZ13-NEXT: larl %r1, .LCPI12_0
-; SZ13-NEXT: vgmf %v2, 2, 8
-; SZ13-NEXT: vgmf %v1, 1, 8
-; SZ13-NEXT: meeb %f1, 0(%r1)
-; SZ13-NEXT: larl %r1, .LCPI12_1
-; SZ13-NEXT: meebr %f2, %f0
-; SZ13-NEXT: meeb %f0, 0(%r1)
-; SZ13-NEXT: vmrhf %v0, %v2, %v0
-; SZ13-NEXT: vrepf %v1, %v1, 0
+; SZ13-NEXT: larl %r2, .LCPI12_1
+; SZ13-NEXT: vgmf %v2, 1, 8
+; SZ13-NEXT: meeb %f2, 0(%r1)
+; SZ13-NEXT: meebr %f1, %f0
+; SZ13-NEXT: meeb %f0, 0(%r2)
+; SZ13-NEXT: vmrhf %v0, %v1, %v0
+; SZ13-NEXT: vrepf %v1, %v2, 0
; SZ13-NEXT: vmrhg %v24, %v0, %v1
; SZ13-NEXT: br %r14
entry:
@@ -666,15 +666,15 @@ define void @constrained_vector_fmul_v3f64(ptr %a) #0 {
;
; SZ13-LABEL: constrained_vector_fmul_v3f64:
; SZ13: # %bb.0: # %entry
-; SZ13-NEXT: larl %r1, .LCPI13_0
-; SZ13-NEXT: ld %f1, 0(%r1)
; SZ13-NEXT: larl %r1, .LCPI13_1
; SZ13-NEXT: vl %v0, 0(%r2), 4
-; SZ13-NEXT: vl %v2, 0(%r1), 3
-; SZ13-NEXT: mdb %f1, 16(%r2)
-; SZ13-NEXT: vfmdb %v0, %v2, %v0
+; SZ13-NEXT: larl %r3, .LCPI13_0
+; SZ13-NEXT: vl %v1, 0(%r1), 3
+; SZ13-NEXT: ld %f2, 0(%r3)
+; SZ13-NEXT: mdb %f2, 16(%r2)
+; SZ13-NEXT: vfmdb %v0, %v1, %v0
; SZ13-NEXT: vst %v0, 0(%r2), 4
-; SZ13-NEXT: std %f1, 16(%r2)
+; SZ13-NEXT: std %f2, 16(%r2)
; SZ13-NEXT: br %r14
entry:
%b = load <3 x double>, ptr %a
@@ -709,13 +709,13 @@ define <4 x double> @constrained_vector_fmul_v4f64() #0 {
; SZ13-LABEL: constrained_vector_fmul_v4f64:
; SZ13: # %bb.0: # %entry
; SZ13-NEXT: larl %r1, .LCPI14_0
+; SZ13-NEXT: larl %r2, .LCPI14_1
+; SZ13-NEXT: larl %r3, .LCPI14_2
; SZ13-NEXT: vl %v0, 0(%r1), 3
-; SZ13-NEXT: larl %r1, .LCPI14_1
-; SZ13-NEXT: vl %v1, 0(%r1), 3
-; SZ13-NEXT: larl %r1, .LCPI14_2
+; SZ13-NEXT: vl %v1, 0(%r2), 3
+; SZ13-NEXT: vl %v2, 0(%r3), 3
; SZ13-NEXT: vfmdb %v26, %v1, %v0
-; SZ13-NEXT: vl %v0, 0(%r1), 3
-; SZ13-NEXT: vfmdb %v24, %v1, %v0
+; SZ13-NEXT: vfmdb %v24, %v1, %v2
; SZ13-NEXT: br %r14
entry:
%mul = call <4 x double> @llvm.experimental.constrained.fmul.v4f64(
@@ -799,14 +799,14 @@ define <3 x float> @constrained_vector_fadd_v3f32() #0 {
; SZ13-LABEL: constrained_vector_fadd_v3f32:
; SZ13: # %bb.0: # %entry
; SZ13-NEXT: vgbm %v0, 61440
-; SZ13-NEXT: vgmf %v2, 1, 1
-; SZ13-NEXT: vgmf %v3, 2, 8
-; SZ13-NEXT: lzer %f1
+; SZ13-NEXT: vgmf %v1, 1, 1
+; SZ13-NEXT: vgmf %v2, 2, 8
+; SZ13-NEXT: lzer %f3
+; SZ13-NEXT: aebr %f3, %f0
; SZ13-NEXT: aebr %f1, %f0
; SZ13-NEXT: aebr %f2, %f0
-; SZ13-NEXT: aebr %f3, %f0
-; SZ13-NEXT: vmrhf %v0, %v2, %v3
-; SZ13-NEXT: vrepf %v1, %v1, 0
+; SZ13-NEXT: vmrhf %v0, %v1, %v2
+; SZ13-NEXT: vrepf %v1, %v3, 0
; SZ13-NEXT: vmrhg %v24, %v0, %v1
; SZ13-NEXT: br %r14
entry:
@@ -836,15 +836,15 @@ define void @constrained_vector_fadd_v3f64(ptr %a) #0 {
;
; SZ13-LABEL: constrained_vector_fadd_v3f64:
; SZ13: # %bb.0: # %entry
-; SZ13-NEXT: larl %r1, .LCPI18_0
-; SZ13-NEXT: ld %f1, 0(%r1)
; SZ13-NEXT: larl %r1, .LCPI18_1
; SZ13-NEXT: vl %v0, 0(%r2), 4
-; SZ13-NEXT: vl %v2, 0(%r1), 3
-; SZ13-NEXT: adb %f1, 16(%r2)
-; SZ13-NEXT: vfadb %v0, %v2, %v0
+; SZ13-NEXT: larl %r3, .LCPI18_0
+; SZ13-NEXT: vl %v1, 0(%r1), 3
+; SZ13-NEXT: ld %f2, 0(%r3)
+; SZ13-NEXT: adb %f2, 16(%r2)
+; SZ13-NEXT: vfadb %v0, %v1, %v0
; SZ13-NEXT: vst %v0, 0(%r2), 4
-; SZ13-NEXT: std %f1, 16(%r2)
+; SZ13-NEXT: std %f2, 16(%r2)
; SZ13-NEXT: br %r14
entry:
%b = load <3 x double>, ptr %a
@@ -879,13 +879,13 @@ define <4 x double> @constrained_vector_fadd_v4f64() #0 {
; SZ13-LABEL: constrained_vector_fadd_v4f64:
; SZ13: # %bb.0: # %entry
; SZ13-NEXT: larl %r1, .LCPI19_0
+; SZ13-NEXT: larl %r2, .LCPI19_1
+; SZ13-NEXT: larl %r3, .LCPI19_2
; SZ13-NEXT: vl %v0, 0(%r1), 3
-; SZ13-NEXT: larl %r1, .LCPI19_1
-; SZ13-NEXT: vl %v1, 0(%r1), 3
-; SZ13-NEXT: larl %r1, .LCPI19_2
+; SZ13-NEXT: vl %v1, 0(%r2), 3
+; SZ13-NEXT: vl %v2, 0(%r3), 3
; SZ13-NEXT: vfadb %v26, %v1, %v0
-; SZ13-NEXT: vl %v0, 0(%r1), 3
-; SZ13-NEXT: vfadb %v24, %v1, %v0
+; SZ13-NEXT: vfadb %v24, %v1, %v2
; SZ13-NEXT: br %r14
entry:
%add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64(
@@ -968,17 +968,17 @@ define <3 x float> @constrained_vector_fsub_v3f32() #0 {
;
; SZ13-LABEL: constrained_vector_fsub_v3f32:
; SZ13: # %bb.0: # %entry
-; SZ13-NEXT: vgbm %v2, 61440
-; SZ13-NEXT: lzer %f1
-; SZ13-NEXT: sebr %f2, %f1
-; SZ13-NEXT: vgmf %v1, 1, 1
-; SZ13-NEXT: vgbm %v3, 61440
; SZ13-NEXT: vgbm %v0, 61440
-; SZ13-NEXT: sebr %f3, %f1
-; SZ13-NEXT: vgmf %v1, 2, 8
-; SZ13-NEXT: sebr %f0, %f1
+; SZ13-NEXT: vgbm %v1, 61440
+; SZ13-NEXT: vgmf %v2, 1, 1
+; SZ13-NEXT: vgbm %v3, 61440
+; SZ13-NEXT: vgmf %v4, 2, 8
+; SZ13-NEXT: lzer %f5
+; SZ13-NEXT: sebr %f1, %f5
+; SZ13-NEXT: sebr %f3, %f2
+; SZ13-NEXT: sebr %f0, %f4
; SZ13-NEXT: vmrhf %v0, %v3, %v0
-; SZ13-NEXT: vrepf %v1, %v2, 0
+; SZ13-NEXT: vrepf %v1, %v1, 0
; SZ13-NEXT: vmrhg %v24, %v0, %v1
; SZ13-NEXT: br %r14
entry:
@@ -1009,12 +1009,12 @@ define void @constrained_vector_fsub_v3f64(ptr %a) #0 {
; SZ13-LABEL: constrained_vector_fsub_v3f64:
; SZ13: # %bb.0: # %entry
; SZ13-NEXT: vl %v0, 0(%r2), 4
-; SZ13-NEXT: vgmg %v2, 12, 10
-; SZ13-NEXT: sdb %f2, 16(%r2)
; SZ13-NEXT: vgmg %v1, 12, 10
-; SZ13-NEXT: vfsdb %v0, %v1, %v0
+; SZ13-NEXT: sdb %f1, 16(%r2)
+; SZ13-NEXT: vgmg %v2, 12, 10
+; SZ13-NEXT: vfsdb %v0, %v2, %v0
; SZ13-NEXT: vst %v0, 0(%r2), 4
-; SZ13-NEXT: std %f2, 16(%r2)
+; SZ13-NEXT: std %f1, 16(%r2)
; SZ13-NEXT: br %r14
entry:
%b = load <3 x double>, ptr %a
@@ -1049,12 +1049,12 @@ define <4 x double> @constrained_vector_fsub_v4f64() #0 {
; SZ13-LABEL: constrained_vector_fsub_v4f64:
; SZ13: # %bb.0: # %entry
; SZ13-NEXT: larl %r1, .LCPI24_0
+; SZ13-NEXT: larl %r2, .LCPI24_1
; SZ13-NEXT: vl %v0, 0(%r1), 3
-; SZ13-NEXT: vgmg %v1, 12, 10
-; SZ13-NEXT: larl %r1, .LCPI24_1
-; SZ13-NEXT: vfsdb %v26, %v1, %v0
-; SZ13-NEXT: vl %v0, 0(%r1), 3
-; SZ13-NEXT: vfsdb %v24, %v1, %v0
+; SZ13-NEXT: vl %v1, 0(%r2), 3
+; SZ13-NEXT: vgmg %v2, 12, 10
+; SZ13-NEXT: vfsdb %v26, %v2, %v0
+; SZ13-NEXT: vfsdb %v24, %v2, %v1
; SZ13-NEXT: br %r14
entry:
%sub = call <4 x double> @llvm.experimental.constrained.fsub.v4f64(
@@ -1126,11 +1126,11 @@ define <3 x float> @constrained_vector_sqrt_v3f32() #0 {
; SZ13: # %bb.0: # %entry
; SZ13-NEXT: larl %r1, .LCPI27_0
; SZ13-NEXT: sqeb %f0, 0(%r1)
-; SZ13-NEXT: larl %r1, .LCPI27_1
+; SZ13-NEXT: larl %r2, .LCPI27_1
+; SZ13-NEXT: larl %r3, .LCPI27_2
+; SZ13-NEXT: sqeb %f1, 0(%r2)
; SZ13-NEXT: vrepf %v0, %v0, 0
-; SZ13-NEXT: sqeb %f1, 0(%r1)
-; SZ13-NEXT: larl %r1, .LCPI27_2
-; SZ13-NEXT: sqeb %f2, 0(%r1)
+; SZ13-NEXT: sqeb %f2, 0(%r3)
; SZ13-NEXT: vmrhf %v1, %v1, %v2
; SZ13-NEXT: vmrhg %v24, %v1, %v0
; SZ13-NEXT: br %r14
@@ -1187,11 +1187,11 @@ define <4 x double> @constrained_vector_sqrt_v4f64() #0 {
; SZ13-LABEL: constrained_vector_sqrt_v4f64:
; SZ13: # %bb.0: # %entry
; SZ13-NEXT: larl %r1, .LCPI29_0
+; SZ13-NEXT: larl %r2, .LCPI29_1
; SZ13-NEXT: vl %v0, 0(%r1), 3
; SZ13-NEXT: vfsqdb %v26, %v0
-; SZ13-NEXT: larl %r1, .LCPI29_1
-; SZ13-NEXT: vl %v0, 0(%r1), 3
-; SZ13-NEXT: vfsqdb %v24, %v0
+; SZ13-NEXT: vl %v1, 0(%r2), 3
+; SZ13-NEXT: vfsqdb %v24, %v1
; SZ13-NEXT: br %r14
entry:
%sqrt = call <4 x double> @llvm.experimental.constrained.sqrt.v4f64(
@@ -5306,9 +5306,9 @@ define <2 x float> @constrained_vector_fptrunc_v2f64() #0 {
; SZ13-LABEL: constrained_vector_fptrunc_v2f64:
; SZ13: # %bb.0: # %entry
; SZ13-NEXT: larl %r1, .LCPI96_0
+; SZ13-NEXT: larl %r2, .LCPI96_1
; SZ13-NEXT: ld %f0, 0(%r1)
-; SZ13-NEXT: larl %r1, .LCPI96_1
-; SZ13-NEXT: ld %f1, 0(%r1)
+; SZ13-NEXT: ld %f1, 0(%r2)
; SZ13-NEXT: ledbra %f0, 0, %f0, 0
; SZ13-NEXT: ledbra %f1, 0, %f1, 0
; SZ13-NEXT: vmrhf %v0, %v1, %v0
@@ -5342,15 +5342,15 @@ define void @constrained_vector_fptrunc_v3f64(ptr %src, ptr %dest) #0 {
;
; SZ13-LABEL: constrained_vector_fptrunc_v3f64:
; SZ13: # %bb.0: # %entry
-; SZ13-NEXT: vl %v1, 0(%r2), 4
-; SZ13-NEXT: ld %f0, 16(%r2)
-; SZ13-NEXT: vledb %v1, %v1, 0, 0
+; SZ13-NEXT: vl %v0, 0(%r2), 4
+; SZ13-NEXT: ld %f1, 16(%r2)
+; SZ13-NEXT: vledb %v0, %v0, 0, 0
; SZ13-NEXT: larl %r1, .LCPI97_0
-; SZ13-NEXT: ledbra %f0, 0, %f0, 0
+; SZ13-NEXT: ledbra %f1, 0, %f1, 0
; SZ13-NEXT: vl %v2, 0(%r1), 3
-; SZ13-NEXT: vperm %v1, %v1, %v1, %v2
-; SZ13-NEXT: ste %f0, 8(%r3)
-; SZ13-NEXT: vsteg %v1, 0(%r3), 0
+; SZ13-NEXT: vperm %v0, %v0, %v0, %v2
+; SZ13-NEXT: ste %f1, 8(%r3)
+; SZ13-NEXT: vsteg %v0, 0(%r3), 0
; SZ13-NEXT: br %r14
entry:
%b = load <3 x double>, ptr %src
@@ -5382,19 +5382,19 @@ define <4 x float> @constrained_vector_fptrunc_v4f64() #0 {
; SZ13-LABEL: constrained_vector_fptrunc_v4f64:
; SZ13: # %bb.0: # %entry
; SZ13-NEXT: larl %r1, .LCPI98_0
+; SZ13-NEXT: larl %r2, .LCPI98_1
+; SZ13-NEXT: larl %r3, .LCPI98_2
+; SZ13-NEXT: larl %r4, .LCPI98_3
; SZ13-NEXT: ld %f0, 0(%r1)
-; SZ13-NEXT: larl %r1, .LCPI98_1
-; SZ13-NEXT: ld %f1, 0(%r1)
+; SZ13-NEXT: ld %f1, 0(%r2)
+; SZ13-NEXT: ld %f2, 0(%r3)
+; SZ13-NEXT: ld %f3, 0(%r4)
; SZ13-NEXT: ledbra %f0, 0, %f0, 0
; SZ13-NEXT: ledbra %f1, 0, %f1, 0
-; SZ13-NEXT: larl %r1, .LCPI98_2
-; SZ13-NEXT: vmrhf %v0, %v1, %v0
-; SZ13-NEXT: ld %f1, 0(%r1)
-; SZ13-NEXT: larl %r1, .LCPI98_3
-; SZ13-NEXT: ld %f2, 0(%r1)
-; SZ13-NEXT: ledbra %f1, 0, %f1, 0
; SZ13-NEXT: ledbra %f2, 0, %f2, 0
-; SZ13-NEXT: vmrhf %v1, %v2, %v1
+; SZ13-NEXT: ledbra %f3, 0, %f3, 0
+; SZ13-NEXT: vmrhf %v0, %v1, %v0
+; SZ13-NEXT: vmrhf %v1, %v3, %v2
; SZ13-NEXT: vmrhg %v24, %v1, %v0
; SZ13-NEXT: br %r14
entry:
@@ -5501,15 +5501,15 @@ define <4 x double> @constrained_vector_fpext_v4f32() #0 {
; SZ13-LABEL: constrained_vector_fpext_v4f32:
; SZ13: # %bb.0: # %entry
; SZ13-NEXT: larl %r1, .LCPI102_0
+; SZ13-NEXT: larl %r2, .LCPI102_1
+; SZ13-NEXT: larl %r3, .LCPI102_2
+; SZ13-NEXT: larl %r4, .LCPI102_3
; SZ13-NEXT: ldeb %f0, 0(%r1)
-; SZ13-NEXT: larl %r1, .LCPI102_1
-; SZ13-NEXT: ldeb %f1, 0(%r1)
-; SZ13-NEXT: larl %r1, .LCPI102_2
+; SZ13-NEXT: ldeb %f1, 0(%r2)
+; SZ13-NEXT: ldeb %f2, 0(%r3)
+; SZ13-NEXT: ldeb %f3, 0(%r4)
; SZ13-NEXT: vmrhg %v24, %v1, %v0
-; SZ13-NEXT: ldeb %f0, 0(%r1)
-; SZ13-NEXT: larl %r1, .LCPI102_3
-; SZ13-NEXT: ldeb %f1, 0(%r1)
-; SZ13-NEXT: vmrhg %v26, %v1, %v0
+; SZ13-NEXT: vmrhg %v26, %v3, %v2
; SZ13-NEXT: br %r14
entry:
%result = call <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f32(
>From 5f9a41f3eeca19f4d8058fcd7b5c464b4c88306f Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Thu, 24 Apr 2025 12:40:06 +0200
Subject: [PATCH 2/2] Rebase. Updates per latest review.
---
llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp | 18 +-
llvm/lib/Target/SystemZ/SystemZInstrInfo.h | 2 +-
.../SystemZ/SystemZMachineScheduler.cpp | 72 +--
.../Target/SystemZ/SystemZMachineScheduler.h | 22 +-
.../Target/SystemZ/SystemZTargetMachine.cpp | 4 +-
llvm/test/CodeGen/SystemZ/fp-half-vector.ll | 419 +++++++++---------
llvm/test/CodeGen/SystemZ/fp-half.ll | 10 +-
llvm/test/CodeGen/SystemZ/int-mul-15.ll | 10 +-
llvm/test/CodeGen/SystemZ/vec-eval.ll | 12 +-
9 files changed, 290 insertions(+), 279 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 0e10cdab714f0..32e9d3336bc1b 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -2162,26 +2162,20 @@ bool SystemZInstrInfo::isLoadAndTestAsCmp(const MachineInstr &MI) const {
return (MI.getOpcode() == SystemZ::LTEBR ||
MI.getOpcode() == SystemZ::LTDBR ||
MI.getOpcode() == SystemZ::LTXBR) &&
- MI.getOperand(0).isDead();
+ MI.getOperand(0).isDead();
}
bool SystemZInstrInfo::isCompareZero(const MachineInstr &Compare) const {
if (isLoadAndTestAsCmp(Compare))
return true;
return Compare.isCompare() && Compare.getNumExplicitOperands() == 2 &&
- Compare.getOperand(1).isImm() && Compare.getOperand(1).getImm() == 0;
+ Compare.getOperand(1).isImm() && Compare.getOperand(1).getImm() == 0;
}
-unsigned SystemZInstrInfo::
-getCompareSourceReg(const MachineInstr &Compare) const {
- unsigned reg = 0;
- if (Compare.isCompare())
- reg = Compare.getOperand(0).getReg();
- else if (isLoadAndTestAsCmp(Compare))
- reg = Compare.getOperand(1).getReg();
- assert(reg);
-
- return reg;
+Register
+SystemZInstrInfo::getCompareSourceReg(const MachineInstr &Compare) const {
+ assert(isCompareZero(Compare) && "Expected a compare with 0.");
+ return Compare.getOperand(isLoadAndTestAsCmp(Compare) ? 1 : 0).getReg();
}
bool SystemZInstrInfo::
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 2030d52becc0e..05fe037e0cabe 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -365,7 +365,7 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
// Return the source register of Compare, which is the unknown value
// being tested.
- unsigned getCompareSourceReg(const MachineInstr &Compare) const;
+ Register getCompareSourceReg(const MachineInstr &Compare) const;
// Try to find all CC users of the compare instruction (MBBI) and update
// all of them to maintain equivalent behavior after swapping the compare
diff --git a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
index 85376ec70edc5..01c578df10800 100644
--- a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -57,15 +57,29 @@ void SystemZPreRASchedStrategy::initializePrioRegClasses(
}
}
-void SystemZPreRASchedStrategy::VRegSet::dump(std::string Msg) {
- dbgs() << Msg.c_str();
+void SystemZPreRASchedStrategy::VRegSet::insert(Register Reg) {
+ assert(Reg.isVirtual());
+ Regs.insert(Reg);
+}
+
+void SystemZPreRASchedStrategy::VRegSet::erase(Register Reg) {
+ assert(Reg.isVirtual());
+ Regs.erase(Reg);
+}
+
+bool SystemZPreRASchedStrategy::VRegSet::count(Register Reg) const {
+ assert(Reg.isVirtual());
+ return Regs.count(Reg);
+}
+
+void SystemZPreRASchedStrategy::VRegSet::dump() const {
bool First = true;
- for (auto R : *this) {
+ for (auto R : Regs) {
if (!First)
dbgs() << ", ";
else
First = false;
- dbgs() << "%" << R.virtRegIndex();
+ dbgs() << printReg(R);
}
dbgs() << "\n";
}
@@ -109,8 +123,8 @@ void SystemZPreRASchedStrategy::initializeStoresGroup() {
return;
if (IsStore)
StoresGroup.insert(SU);
- }
- else if (IsStore && !StoresGroup.empty() && SU->getDepth() == CurrMaxDepth) {
+ } else if (IsStore && !StoresGroup.empty() &&
+ SU->getDepth() == CurrMaxDepth) {
// The group members should all have the same opcode.
if ((*StoresGroup.begin())->getInstr()->getOpcode() != MI->getOpcode()) {
StoresGroup.clear();
@@ -142,9 +156,8 @@ static int biasPhysRegExtra(const SUnit *SU) {
return 0;
}
-int SystemZPreRASchedStrategy::
-computeSULivenessScore(SchedCandidate &C, ScheduleDAGMILive *DAG,
- SchedBoundary *Zone) const {
+int SystemZPreRASchedStrategy::computeSULivenessScore(
+ SchedCandidate &C, ScheduleDAGMILive *DAG, SchedBoundary *Zone) const {
// Not all data deps are modelled around the SUnit - some data edges near
// boundaries are missing: Look directly at the MI operands instead.
const SUnit *SU = C.SU;
@@ -246,22 +259,24 @@ bool SystemZPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
return TryCand.Reason != NoCand;
// Don't extend the scheduled latency.
- if (ShouldReduceLatency && TryCand.SU->getHeight() != Cand.SU->getHeight() &&
+ if (ShouldReduceLatency &&
+ TryCand.SU->getHeight() != Cand.SU->getHeight() &&
(std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) >
Zone->getScheduledLatency())) {
- unsigned HigherSUDepth = TryCand.SU->getHeight() < Cand.SU->getHeight() ?
- Cand.SU->getDepth() : TryCand.SU->getDepth();
+ unsigned HigherSUDepth = TryCand.SU->getHeight() < Cand.SU->getHeight()
+ ? Cand.SU->getDepth()
+ : TryCand.SU->getDepth();
if (HigherSUDepth != getRemLat(Zone) &&
- tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
- TryCand, Cand, GenericSchedulerBase::BotHeightReduce)) {
+ tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), TryCand, Cand,
+ GenericSchedulerBase::BotHeightReduce)) {
return TryCand.Reason != NoCand;
}
}
}
// Weak edges are for clustering and other constraints.
- if (tryLess(TryCand.SU->WeakSuccsLeft, Cand.SU->WeakSuccsLeft,
- TryCand, Cand, Weak))
+ if (tryLess(TryCand.SU->WeakSuccsLeft, Cand.SU->WeakSuccsLeft, TryCand, Cand,
+ Weak))
return TryCand.Reason != NoCand;
// Fall through to original instruction order.
@@ -361,17 +376,22 @@ void SystemZPreRASchedStrategy::initialize(ScheduleDAGMI *dag) {
LLVM_DEBUG(if (ShouldReduceLatency) dbgs() << "Latency scheduling enabled.\n";
else dbgs() << "Latency scheduling disabled.\n";);
- // Find the registers that are live at the bottom, before scheduling.
+ // Find the registers used in the region that are live out.
LiveRegs.clear();
- for (unsigned I = 0, E = DAG->MRI.getNumVirtRegs(); I != E; ++I) {
- Register VirtReg = Register::index2VirtReg(I);
- const LiveInterval &LI = DAG->getLIS()->getInterval(VirtReg);
- LiveQueryResult LRQ = LI.Query(
- DAG->getLIS()->getInstructionIndex(*DAG->SUnits.back().getInstr()));
- if (LRQ.valueOut())
- LiveRegs.insert(VirtReg);
+ std::set<Register> Visited;
+ for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+ const MachineInstr *MI = DAG->SUnits[Idx].getInstr();
+ for (auto &MO : MI->explicit_operands())
+ if (MO.isReg() && MO.getReg().isVirtual() &&
+ Visited.insert(MO.getReg()).second) {
+ const LiveInterval &LI = DAG->getLIS()->getInterval(MO.getReg());
+ LiveQueryResult LRQ = LI.Query(
+ DAG->getLIS()->getInstructionIndex(*DAG->SUnits.back().getInstr()));
+ if (LRQ.valueOut())
+ LiveRegs.insert(MO.getReg());
+ }
}
- LLVM_DEBUG(LiveRegs.dump("Live out at bottom: "););
+ LLVM_DEBUG(dbgs() << "Live out at bottom: "; LiveRegs.dump(););
// If MI uses the register it defines, record it one time here.
IsRedefining = std::vector<bool>(DAG->SUnits.size(), false);
@@ -395,7 +415,7 @@ void SystemZPreRASchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (TinyRegion)
return;
- LLVM_DEBUG(LiveRegs.dump("Live regs was: "););
+ LLVM_DEBUG(dbgs() << "Live regs was: "; LiveRegs.dump(););
if (!FirstStoreInGroupScheduled && StoresGroup.count(SU))
FirstStoreInGroupScheduled = true;
diff --git a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
index 6219e70839bc9..6df2fcaab5572 100644
--- a/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
+++ b/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -50,16 +50,19 @@ class SystemZPreRASchedStrategy : public GenericScheduler {
// Num instructions left to schedule.
unsigned NumLeft;
- // Tru if latency scheduling is enabled.
+ // True if latency scheduling is enabled.
bool ShouldReduceLatency;
// Keep track of currently live registers.
- struct VRegSet : std::set<Register> {
- void dump(std::string Msg);
- size_type count(Register Reg) const {
- assert(Reg.isVirtual());
- return std::set<Register>::count(Reg);
- }
+ class VRegSet {
+ std::set<Register> Regs;
+
+ public:
+ void clear() { Regs.clear(); }
+ void insert(Register Reg);
+ void erase(Register Reg);
+ bool count(Register Reg) const;
+ void dump() const;
} LiveRegs;
// True if MI is also using the register it defines.
@@ -70,7 +73,7 @@ class SystemZPreRASchedStrategy : public GenericScheduler {
unsigned getRemLat(SchedBoundary *Zone) const;
// A large group of stores at the bottom is spread upwards.
- std::set<const SUnit*> StoresGroup;
+ std::set<const SUnit *> StoresGroup;
bool FirstStoreInGroupScheduled;
void initializeStoresGroup();
@@ -86,7 +89,8 @@ class SystemZPreRASchedStrategy : public GenericScheduler {
SchedBoundary *Zone) const override;
public:
- SystemZPreRASchedStrategy(const MachineSchedContext *C) : GenericScheduler(C) {
+ SystemZPreRASchedStrategy(const MachineSchedContext *C)
+ : GenericScheduler(C) {
initializePrioRegClasses(C->MF->getRegInfo().getTargetRegisterInfo());
}
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 70460c6197f29..5a72002aa3369 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -212,7 +212,7 @@ SystemZTargetMachine::getSubtargetImpl(const Function &F) const {
}
ScheduleDAGInstrs *
-SystemZTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
+SystemZTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
// Use GenericScheduler if requested on CL or for Z10, which has no sched
// model.
if (GenericSched ||
@@ -220,7 +220,7 @@ SystemZTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
return nullptr;
ScheduleDAGMILive *DAG =
- new ScheduleDAGMILive(C, std::make_unique<SystemZPreRASchedStrategy>(C));
+ new ScheduleDAGMILive(C, std::make_unique<SystemZPreRASchedStrategy>(C));
DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
index 4997c5b0c617d..588cc9d65ac46 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll
@@ -30,59 +30,59 @@ define <8 x half> @fun0(<8 x half> %Op) {
; NOVEC-NEXT: .cfi_offset %f13, -208
; NOVEC-NEXT: .cfi_offset %f14, -216
; NOVEC-NEXT: .cfi_offset %f15, -224
-; NOVEC-NEXT: lgh %r0, 414(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f14, %r0
-; NOVEC-NEXT: lgh %r0, 406(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f12, %r0
-; NOVEC-NEXT: lgh %r0, 398(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f9, %r0
+; NOVEC-NEXT: lgr %r13, %r2
; NOVEC-NEXT: lgh %r0, 390(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ler %f10, %f6
+; NOVEC-NEXT: lgh %r1, 414(%r15)
+; NOVEC-NEXT: lgh %r2, 406(%r15)
+; NOVEC-NEXT: lgh %r3, 398(%r15)
+; NOVEC-NEXT: sllg %r1, %r1, 48
+; NOVEC-NEXT: sllg %r2, %r2, 48
+; NOVEC-NEXT: sllg %r0, %r0, 48
+; NOVEC-NEXT: sllg %r3, %r3, 48
+; NOVEC-NEXT: ler %f9, %f6
; NOVEC-NEXT: ler %f11, %f4
-; NOVEC-NEXT: ler %f13, %f2
-; NOVEC-NEXT: ler %f15, %f0
-; NOVEC-NEXT: lgr %r13, %r2
+; NOVEC-NEXT: ler %f12, %f2
+; NOVEC-NEXT: ler %f14, %f0
; NOVEC-NEXT: ldgr %f0, %r0
; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: ldgr %f15, %r1
+; NOVEC-NEXT: ldgr %f13, %r2
+; NOVEC-NEXT: ldgr %f10, %r3
; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
; NOVEC-NEXT: aebr %f0, %f0
; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
; NOVEC-NEXT: ler %f8, %f0
-; NOVEC-NEXT: ler %f0, %f9
+; NOVEC-NEXT: ler %f0, %f10
; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
; NOVEC-NEXT: aebr %f0, %f0
; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f9, %f0
-; NOVEC-NEXT: ler %f0, %f12
+; NOVEC-NEXT: ler %f10, %f0
+; NOVEC-NEXT: ler %f0, %f13
; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
; NOVEC-NEXT: aebr %f0, %f0
; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f12, %f0
-; NOVEC-NEXT: ler %f0, %f14
+; NOVEC-NEXT: ler %f13, %f0
+; NOVEC-NEXT: ler %f0, %f15
; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
; NOVEC-NEXT: aebr %f0, %f0
; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f14, %f0
-; NOVEC-NEXT: ler %f0, %f15
+; NOVEC-NEXT: ler %f15, %f0
+; NOVEC-NEXT: ler %f0, %f14
; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
; NOVEC-NEXT: aebr %f0, %f0
; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f15, %f0
-; NOVEC-NEXT: ler %f0, %f13
+; NOVEC-NEXT: ler %f14, %f0
+; NOVEC-NEXT: ler %f0, %f12
; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
; NOVEC-NEXT: aebr %f0, %f0
; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
-; NOVEC-NEXT: ler %f13, %f0
+; NOVEC-NEXT: ler %f12, %f0
; NOVEC-NEXT: ler %f0, %f11
; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
; NOVEC-NEXT: aebr %f0, %f0
; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
; NOVEC-NEXT: ler %f11, %f0
-; NOVEC-NEXT: ler %f0, %f10
+; NOVEC-NEXT: ler %f0, %f9
; NOVEC-NEXT: brasl %r14, __extendhfsf2 at PLT
; NOVEC-NEXT: aebr %f0, %f0
; NOVEC-NEXT: brasl %r14, __truncsfhf2 at PLT
@@ -93,19 +93,19 @@ define <8 x half> @fun0(<8 x half> %Op) {
; NOVEC-NEXT: lgdr %r0, %f11
; NOVEC-NEXT: srlg %r0, %r0, 48
; NOVEC-NEXT: sth %r0, 4(%r13)
-; NOVEC-NEXT: lgdr %r0, %f13
+; NOVEC-NEXT: lgdr %r0, %f12
; NOVEC-NEXT: srlg %r0, %r0, 48
; NOVEC-NEXT: sth %r0, 2(%r13)
-; NOVEC-NEXT: lgdr %r0, %f15
+; NOVEC-NEXT: lgdr %r0, %f14
; NOVEC-NEXT: srlg %r0, %r0, 48
; NOVEC-NEXT: sth %r0, 0(%r13)
-; NOVEC-NEXT: lgdr %r0, %f14
+; NOVEC-NEXT: lgdr %r0, %f15
; NOVEC-NEXT: srlg %r0, %r0, 48
; NOVEC-NEXT: sth %r0, 14(%r13)
-; NOVEC-NEXT: lgdr %r0, %f12
+; NOVEC-NEXT: lgdr %r0, %f13
; NOVEC-NEXT: srlg %r0, %r0, 48
; NOVEC-NEXT: sth %r0, 12(%r13)
-; NOVEC-NEXT: lgdr %r0, %f9
+; NOVEC-NEXT: lgdr %r0, %f10
; NOVEC-NEXT: srlg %r0, %r0, 48
; NOVEC-NEXT: sth %r0, 10(%r13)
; NOVEC-NEXT: lgdr %r0, %f8
@@ -392,54 +392,60 @@ entry:
define void @fun3(ptr %Src, ptr %Dst) {
; NOVEC-LABEL: fun3:
; NOVEC: # %bb.0: # %entry
+; NOVEC-NEXT: stmg %r12, %r15, 96(%r15)
+; NOVEC-NEXT: .cfi_offset %r12, -64
+; NOVEC-NEXT: .cfi_offset %r13, -56
+; NOVEC-NEXT: .cfi_offset %r14, -48
+; NOVEC-NEXT: .cfi_offset %r15, -40
; NOVEC-NEXT: lgh %r0, 0(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
+; NOVEC-NEXT: lgh %r1, 2(%r2)
+; NOVEC-NEXT: lgh %r4, 4(%r2)
+; NOVEC-NEXT: lgh %r5, 6(%r2)
+; NOVEC-NEXT: lgh %r14, 8(%r2)
+; NOVEC-NEXT: lgh %r13, 10(%r2)
+; NOVEC-NEXT: lgh %r12, 12(%r2)
+; NOVEC-NEXT: lgh %r2, 14(%r2)
+; NOVEC-NEXT: sllg %r2, %r2, 48
+; NOVEC-NEXT: ldgr %f0, %r2
+; NOVEC-NEXT: lgdr %r2, %f0
+; NOVEC-NEXT: srlg %r2, %r2, 48
+; NOVEC-NEXT: sth %r2, 14(%r3)
+; NOVEC-NEXT: sllg %r2, %r12, 48
+; NOVEC-NEXT: ldgr %f0, %r2
+; NOVEC-NEXT: lgdr %r2, %f0
+; NOVEC-NEXT: srlg %r2, %r2, 48
+; NOVEC-NEXT: sth %r2, 12(%r3)
+; NOVEC-NEXT: sllg %r2, %r13, 48
+; NOVEC-NEXT: ldgr %f0, %r2
+; NOVEC-NEXT: lgdr %r2, %f0
+; NOVEC-NEXT: srlg %r2, %r2, 48
+; NOVEC-NEXT: sth %r2, 10(%r3)
+; NOVEC-NEXT: sllg %r2, %r14, 48
+; NOVEC-NEXT: ldgr %f0, %r2
+; NOVEC-NEXT: lgdr %r2, %f0
+; NOVEC-NEXT: srlg %r2, %r2, 48
+; NOVEC-NEXT: sth %r2, 8(%r3)
+; NOVEC-NEXT: sllg %r2, %r5, 48
+; NOVEC-NEXT: ldgr %f0, %r2
+; NOVEC-NEXT: lgdr %r2, %f0
+; NOVEC-NEXT: srlg %r2, %r2, 48
+; NOVEC-NEXT: sth %r2, 6(%r3)
+; NOVEC-NEXT: sllg %r2, %r4, 48
+; NOVEC-NEXT: ldgr %f0, %r2
+; NOVEC-NEXT: sllg %r1, %r1, 48
+; NOVEC-NEXT: lgdr %r2, %f0
+; NOVEC-NEXT: ldgr %f0, %r1
+; NOVEC-NEXT: sllg %r0, %r0, 48
+; NOVEC-NEXT: lgdr %r1, %f0
; NOVEC-NEXT: ldgr %f0, %r0
-; NOVEC-NEXT: lgh %r0, 2(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f1, %r0
-; NOVEC-NEXT: lgh %r0, 4(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f2, %r0
-; NOVEC-NEXT: lgh %r0, 6(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f3, %r0
-; NOVEC-NEXT: lgh %r0, 8(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f4, %r0
-; NOVEC-NEXT: lgh %r0, 10(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f5, %r0
-; NOVEC-NEXT: lgh %r0, 12(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f6, %r0
-; NOVEC-NEXT: lgh %r0, 14(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f7, %r0
-; NOVEC-NEXT: lgdr %r0, %f7
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 14(%r3)
-; NOVEC-NEXT: lgdr %r0, %f6
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 12(%r3)
-; NOVEC-NEXT: lgdr %r0, %f5
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 10(%r3)
-; NOVEC-NEXT: lgdr %r0, %f4
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 8(%r3)
-; NOVEC-NEXT: lgdr %r0, %f3
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 6(%r3)
-; NOVEC-NEXT: lgdr %r0, %f2
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 4(%r3)
-; NOVEC-NEXT: lgdr %r0, %f1
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 2(%r3)
; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: srlg %r2, %r2, 48
+; NOVEC-NEXT: srlg %r1, %r1, 48
; NOVEC-NEXT: srlg %r0, %r0, 48
+; NOVEC-NEXT: sth %r2, 4(%r3)
+; NOVEC-NEXT: sth %r1, 2(%r3)
; NOVEC-NEXT: sth %r0, 0(%r3)
+; NOVEC-NEXT: lmg %r12, %r15, 96(%r15)
; NOVEC-NEXT: br %r14
;
; VECTOR-LABEL: fun3:
@@ -472,104 +478,105 @@ declare <8 x half> @foo(<8 x half>)
define void @fun4(ptr %Src, ptr %Dst) {
; NOVEC-LABEL: fun4:
; NOVEC: # %bb.0: # %entry
-; NOVEC-NEXT: stmg %r13, %r15, 104(%r15)
+; NOVEC-NEXT: stmg %r12, %r15, 96(%r15)
+; NOVEC-NEXT: .cfi_offset %r12, -64
; NOVEC-NEXT: .cfi_offset %r13, -56
; NOVEC-NEXT: .cfi_offset %r14, -48
; NOVEC-NEXT: .cfi_offset %r15, -40
; NOVEC-NEXT: aghi %r15, -208
; NOVEC-NEXT: .cfi_def_cfa_offset 368
+; NOVEC-NEXT: lgr %r13, %r3
; NOVEC-NEXT: lgh %r0, 0(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
+; NOVEC-NEXT: lgh %r1, 2(%r2)
+; NOVEC-NEXT: lgh %r3, 4(%r2)
+; NOVEC-NEXT: lgh %r4, 6(%r2)
+; NOVEC-NEXT: lgh %r5, 8(%r2)
+; NOVEC-NEXT: lgh %r14, 10(%r2)
+; NOVEC-NEXT: lgh %r12, 12(%r2)
+; NOVEC-NEXT: lgh %r2, 14(%r2)
+; NOVEC-NEXT: sllg %r2, %r2, 48
+; NOVEC-NEXT: ldgr %f0, %r2
+; NOVEC-NEXT: lgdr %r2, %f0
+; NOVEC-NEXT: sllg %r12, %r12, 48
+; NOVEC-NEXT: srlg %r2, %r2, 48
+; NOVEC-NEXT: ldgr %f0, %r12
+; NOVEC-NEXT: sth %r2, 190(%r15)
+; NOVEC-NEXT: lgdr %r2, %f0
+; NOVEC-NEXT: sllg %r14, %r14, 48
+; NOVEC-NEXT: srlg %r2, %r2, 48
+; NOVEC-NEXT: ldgr %f1, %r14
+; NOVEC-NEXT: sth %r2, 182(%r15)
+; NOVEC-NEXT: lgdr %r2, %f1
+; NOVEC-NEXT: sllg %r5, %r5, 48
+; NOVEC-NEXT: ldgr %f0, %r5
+; NOVEC-NEXT: srlg %r2, %r2, 48
+; NOVEC-NEXT: sth %r2, 174(%r15)
+; NOVEC-NEXT: lgdr %r2, %f0
+; NOVEC-NEXT: srlg %r2, %r2, 48
+; NOVEC-NEXT: sllg %r0, %r0, 48
+; NOVEC-NEXT: sllg %r1, %r1, 48
+; NOVEC-NEXT: sllg %r3, %r3, 48
+; NOVEC-NEXT: sllg %r4, %r4, 48
+; NOVEC-NEXT: sth %r2, 166(%r15)
+; NOVEC-NEXT: la %r2, 192(%r15)
; NOVEC-NEXT: ldgr %f0, %r0
-; NOVEC-NEXT: lgh %r0, 2(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f2, %r0
; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d
+; NOVEC-NEXT: ldgr %f2, %r1
; NOVEC-NEXT: # kill: def $f2h killed $f2h killed $f2d
-; NOVEC-NEXT: lgh %r0, 4(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f4, %r0
+; NOVEC-NEXT: ldgr %f4, %r3
; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d
-; NOVEC-NEXT: lgh %r0, 6(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f6, %r0
+; NOVEC-NEXT: ldgr %f6, %r4
; NOVEC-NEXT: # kill: def $f6h killed $f6h killed $f6d
-; NOVEC-NEXT: lgh %r0, 8(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f1, %r0
-; NOVEC-NEXT: lgh %r0, 10(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f3, %r0
-; NOVEC-NEXT: lgh %r0, 12(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f5, %r0
-; NOVEC-NEXT: lgh %r0, 14(%r2)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f7, %r0
-; NOVEC-NEXT: lgdr %r0, %f7
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 190(%r15)
-; NOVEC-NEXT: lgdr %r0, %f5
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 182(%r15)
-; NOVEC-NEXT: lgdr %r0, %f3
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 174(%r15)
-; NOVEC-NEXT: lgdr %r0, %f1
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: la %r2, 192(%r15)
-; NOVEC-NEXT: lgr %r13, %r3
-; NOVEC-NEXT: sth %r0, 166(%r15)
; NOVEC-NEXT: brasl %r14, foo at PLT
-; NOVEC-NEXT: lgh %r0, 192(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f0, %r0
-; NOVEC-NEXT: lgh %r0, 194(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f1, %r0
-; NOVEC-NEXT: lgh %r0, 196(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f2, %r0
-; NOVEC-NEXT: lgh %r0, 198(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f3, %r0
-; NOVEC-NEXT: lgh %r0, 200(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f4, %r0
-; NOVEC-NEXT: lgh %r0, 202(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f5, %r0
-; NOVEC-NEXT: lgh %r0, 204(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f6, %r0
; NOVEC-NEXT: lgh %r0, 206(%r15)
; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f7, %r0
-; NOVEC-NEXT: lgdr %r0, %f7
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 14(%r13)
-; NOVEC-NEXT: lgdr %r0, %f6
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 12(%r13)
-; NOVEC-NEXT: lgdr %r0, %f5
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 10(%r13)
-; NOVEC-NEXT: lgdr %r0, %f4
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 8(%r13)
-; NOVEC-NEXT: lgdr %r0, %f3
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 6(%r13)
-; NOVEC-NEXT: lgdr %r0, %f2
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 4(%r13)
-; NOVEC-NEXT: lgdr %r0, %f1
-; NOVEC-NEXT: srlg %r0, %r0, 48
-; NOVEC-NEXT: sth %r0, 2(%r13)
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lgh %r12, 204(%r15)
; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: srlg %r3, %r0, 48
+; NOVEC-NEXT: lgh %r0, 192(%r15)
+; NOVEC-NEXT: lgh %r1, 194(%r15)
+; NOVEC-NEXT: lgh %r2, 196(%r15)
+; NOVEC-NEXT: lgh %r4, 198(%r15)
+; NOVEC-NEXT: lgh %r5, 200(%r15)
+; NOVEC-NEXT: lgh %r14, 202(%r15)
+; NOVEC-NEXT: sth %r3, 14(%r13)
+; NOVEC-NEXT: sllg %r3, %r12, 48
+; NOVEC-NEXT: ldgr %f0, %r3
+; NOVEC-NEXT: lgdr %r3, %f0
+; NOVEC-NEXT: srlg %r3, %r3, 48
+; NOVEC-NEXT: sth %r3, 12(%r13)
+; NOVEC-NEXT: sllg %r3, %r14, 48
+; NOVEC-NEXT: ldgr %f0, %r3
+; NOVEC-NEXT: lgdr %r3, %f0
+; NOVEC-NEXT: srlg %r3, %r3, 48
+; NOVEC-NEXT: sth %r3, 10(%r13)
+; NOVEC-NEXT: sllg %r3, %r5, 48
+; NOVEC-NEXT: ldgr %f0, %r3
+; NOVEC-NEXT: lgdr %r3, %f0
+; NOVEC-NEXT: srlg %r3, %r3, 48
+; NOVEC-NEXT: sth %r3, 8(%r13)
+; NOVEC-NEXT: sllg %r3, %r4, 48
+; NOVEC-NEXT: ldgr %f0, %r3
+; NOVEC-NEXT: sllg %r2, %r2, 48
+; NOVEC-NEXT: lgdr %r3, %f0
+; NOVEC-NEXT: ldgr %f0, %r2
+; NOVEC-NEXT: sllg %r1, %r1, 48
+; NOVEC-NEXT: lgdr %r2, %f0
+; NOVEC-NEXT: ldgr %f0, %r1
+; NOVEC-NEXT: sllg %r0, %r0, 48
+; NOVEC-NEXT: lgdr %r1, %f0
+; NOVEC-NEXT: ldgr %f0, %r0
+; NOVEC-NEXT: lgdr %r0, %f0
+; NOVEC-NEXT: srlg %r3, %r3, 48
+; NOVEC-NEXT: srlg %r2, %r2, 48
+; NOVEC-NEXT: srlg %r1, %r1, 48
; NOVEC-NEXT: srlg %r0, %r0, 48
+; NOVEC-NEXT: sth %r3, 6(%r13)
+; NOVEC-NEXT: sth %r2, 4(%r13)
+; NOVEC-NEXT: sth %r1, 2(%r13)
; NOVEC-NEXT: sth %r0, 0(%r13)
-; NOVEC-NEXT: lmg %r13, %r15, 312(%r15)
+; NOVEC-NEXT: lmg %r12, %r15, 304(%r15)
; NOVEC-NEXT: br %r14
;
; VECTOR-LABEL: fun4:
@@ -628,70 +635,58 @@ define void @fun5(<4 x half> %dummy, <8 x half> %Arg5) {
; NOVEC-NEXT: stmg %r14, %r15, 112(%r15)
; NOVEC-NEXT: .cfi_offset %r14, -48
; NOVEC-NEXT: .cfi_offset %r15, -40
-; NOVEC-NEXT: aghi %r15, -256
-; NOVEC-NEXT: .cfi_def_cfa_offset 416
-; NOVEC-NEXT: std %f8, 248(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f9, 240(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f10, 232(%r15) # 8-byte Spill
-; NOVEC-NEXT: std %f11, 224(%r15) # 8-byte Spill
-; NOVEC-NEXT: .cfi_offset %f8, -168
-; NOVEC-NEXT: .cfi_offset %f9, -176
-; NOVEC-NEXT: .cfi_offset %f10, -184
-; NOVEC-NEXT: .cfi_offset %f11, -192
-; NOVEC-NEXT: lgh %r0, 422(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f1, %r0
-; NOVEC-NEXT: lgh %r0, 430(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f3, %r0
-; NOVEC-NEXT: lgh %r0, 438(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f5, %r0
+; NOVEC-NEXT: aghi %r15, -224
+; NOVEC-NEXT: .cfi_def_cfa_offset 384
; NOVEC-NEXT: lgh %r0, 446(%r15)
; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f7, %r0
-; NOVEC-NEXT: lgh %r0, 454(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f8, %r0
-; NOVEC-NEXT: lgh %r0, 462(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f9, %r0
-; NOVEC-NEXT: lgh %r0, 470(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f10, %r0
-; NOVEC-NEXT: lgh %r0, 478(%r15)
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f11, %r0
-; NOVEC-NEXT: lgdr %r0, %f11
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lgdr %r0, %f1
; NOVEC-NEXT: srlg %r0, %r0, 48
; NOVEC-NEXT: sth %r0, 222(%r15)
-; NOVEC-NEXT: lgdr %r0, %f10
+; NOVEC-NEXT: lgh %r0, 438(%r15)
+; NOVEC-NEXT: sllg %r0, %r0, 48
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lgdr %r0, %f1
; NOVEC-NEXT: srlg %r0, %r0, 48
; NOVEC-NEXT: sth %r0, 214(%r15)
-; NOVEC-NEXT: lgdr %r0, %f9
+; NOVEC-NEXT: lgh %r0, 430(%r15)
+; NOVEC-NEXT: sllg %r0, %r0, 48
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lgdr %r0, %f1
; NOVEC-NEXT: srlg %r0, %r0, 48
; NOVEC-NEXT: sth %r0, 206(%r15)
-; NOVEC-NEXT: lgdr %r0, %f8
+; NOVEC-NEXT: lgh %r0, 422(%r15)
+; NOVEC-NEXT: sllg %r0, %r0, 48
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lgdr %r0, %f1
; NOVEC-NEXT: srlg %r0, %r0, 48
; NOVEC-NEXT: sth %r0, 198(%r15)
-; NOVEC-NEXT: lgdr %r0, %f7
+; NOVEC-NEXT: lgh %r0, 414(%r15)
+; NOVEC-NEXT: sllg %r0, %r0, 48
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lgdr %r0, %f1
; NOVEC-NEXT: srlg %r0, %r0, 48
; NOVEC-NEXT: sth %r0, 190(%r15)
-; NOVEC-NEXT: lgdr %r0, %f5
+; NOVEC-NEXT: lgh %r0, 406(%r15)
+; NOVEC-NEXT: sllg %r0, %r0, 48
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lgdr %r0, %f1
; NOVEC-NEXT: srlg %r0, %r0, 48
; NOVEC-NEXT: sth %r0, 182(%r15)
-; NOVEC-NEXT: lgdr %r0, %f3
+; NOVEC-NEXT: lgh %r0, 398(%r15)
+; NOVEC-NEXT: sllg %r0, %r0, 48
+; NOVEC-NEXT: ldgr %f1, %r0
+; NOVEC-NEXT: lgdr %r0, %f1
; NOVEC-NEXT: srlg %r0, %r0, 48
; NOVEC-NEXT: sth %r0, 174(%r15)
+; NOVEC-NEXT: lgh %r0, 390(%r15)
+; NOVEC-NEXT: sllg %r0, %r0, 48
+; NOVEC-NEXT: ldgr %f1, %r0
; NOVEC-NEXT: lgdr %r0, %f1
; NOVEC-NEXT: srlg %r0, %r0, 48
; NOVEC-NEXT: sth %r0, 166(%r15)
; NOVEC-NEXT: brasl %r14, foo2 at PLT
-; NOVEC-NEXT: ld %f8, 248(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f9, 240(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f10, 232(%r15) # 8-byte Reload
-; NOVEC-NEXT: ld %f11, 224(%r15) # 8-byte Reload
-; NOVEC-NEXT: lmg %r14, %r15, 368(%r15)
+; NOVEC-NEXT: lmg %r14, %r15, 336(%r15)
; NOVEC-NEXT: br %r14
;
; VECTOR-LABEL: fun5:
@@ -701,21 +696,21 @@ define void @fun5(<4 x half> %dummy, <8 x half> %Arg5) {
; VECTOR-NEXT: .cfi_offset %r15, -40
; VECTOR-NEXT: aghi %r15, -224
; VECTOR-NEXT: .cfi_def_cfa_offset 384
+; VECTOR-NEXT: vlreph %v1, 446(%r15)
+; VECTOR-NEXT: vsteh %v1, 222(%r15), 0
+; VECTOR-NEXT: vlreph %v1, 438(%r15)
+; VECTOR-NEXT: vsteh %v1, 214(%r15), 0
+; VECTOR-NEXT: vlreph %v1, 430(%r15)
+; VECTOR-NEXT: vsteh %v1, 206(%r15), 0
+; VECTOR-NEXT: vlreph %v1, 422(%r15)
+; VECTOR-NEXT: vsteh %v1, 198(%r15), 0
+; VECTOR-NEXT: vlreph %v1, 414(%r15)
+; VECTOR-NEXT: vsteh %v1, 190(%r15), 0
+; VECTOR-NEXT: vlreph %v1, 406(%r15)
+; VECTOR-NEXT: vsteh %v1, 182(%r15), 0
+; VECTOR-NEXT: vlreph %v1, 398(%r15)
+; VECTOR-NEXT: vsteh %v1, 174(%r15), 0
; VECTOR-NEXT: vlreph %v1, 390(%r15)
-; VECTOR-NEXT: vlreph %v3, 398(%r15)
-; VECTOR-NEXT: vlreph %v5, 406(%r15)
-; VECTOR-NEXT: vlreph %v7, 414(%r15)
-; VECTOR-NEXT: vlreph %v16, 422(%r15)
-; VECTOR-NEXT: vlreph %v17, 430(%r15)
-; VECTOR-NEXT: vlreph %v18, 438(%r15)
-; VECTOR-NEXT: vlreph %v19, 446(%r15)
-; VECTOR-NEXT: vsteh %v19, 222(%r15), 0
-; VECTOR-NEXT: vsteh %v18, 214(%r15), 0
-; VECTOR-NEXT: vsteh %v17, 206(%r15), 0
-; VECTOR-NEXT: vsteh %v16, 198(%r15), 0
-; VECTOR-NEXT: vsteh %v7, 190(%r15), 0
-; VECTOR-NEXT: vsteh %v5, 182(%r15), 0
-; VECTOR-NEXT: vsteh %v3, 174(%r15), 0
; VECTOR-NEXT: vsteh %v1, 166(%r15), 0
; VECTOR-NEXT: brasl %r14, foo2 at PLT
; VECTOR-NEXT: lmg %r14, %r15, 336(%r15)
diff --git a/llvm/test/CodeGen/SystemZ/fp-half.ll b/llvm/test/CodeGen/SystemZ/fp-half.ll
index f479e405b04e9..c311152eadf51 100644
--- a/llvm/test/CodeGen/SystemZ/fp-half.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-half.ll
@@ -564,14 +564,14 @@ define void @fun11() {
; NOVEC-NEXT: aghi %r15, -160
; NOVEC-NEXT: .cfi_def_cfa_offset 320
; NOVEC-NEXT: lghrl %r0, .LCPI11_0
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f4, %r0
-; NOVEC-NEXT: lghrl %r0, .LCPI11_1
+; NOVEC-NEXT: lghrl %r1, .LCPI11_1
; NOVEC-NEXT: lzer %f2
+; NOVEC-NEXT: sllg %r0, %r0, 48
+; NOVEC-NEXT: sllg %r1, %r1, 48
; NOVEC-NEXT: lcdfr %f0, %f2
+; NOVEC-NEXT: ldgr %f4, %r0
; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d
-; NOVEC-NEXT: sllg %r0, %r0, 48
-; NOVEC-NEXT: ldgr %f6, %r0
+; NOVEC-NEXT: ldgr %f6, %r1
; NOVEC-NEXT: # kill: def $f6h killed $f6h killed $f6d
; NOVEC-NEXT: brasl %r14, foo2 at PLT
; NOVEC-NEXT: lmg %r14, %r15, 272(%r15)
diff --git a/llvm/test/CodeGen/SystemZ/int-mul-15.ll b/llvm/test/CodeGen/SystemZ/int-mul-15.ll
index 01b7a21540491..b7d41412d9c5f 100644
--- a/llvm/test/CodeGen/SystemZ/int-mul-15.ll
+++ b/llvm/test/CodeGen/SystemZ/int-mul-15.ll
@@ -43,10 +43,10 @@ define i64 @f3(i64 %dummy, i64 %a, i64 %b) {
; CHECK-NEXT: vgbm %v1, 0
; CHECK-NEXT: vlvgg %v1, %r3, 1
; CHECK-NEXT: vlvgg %v0, %r4, 1
-; CHECK-NEXT: vrepib %v2, 67
; CHECK-NEXT: vmlq %v0, %v1, %v0
-; CHECK-NEXT: vsrlb %v0, %v0, %v2
-; CHECK-NEXT: vsrl %v0, %v0, %v2
+; CHECK-NEXT: vrepib %v1, 67
+; CHECK-NEXT: vsrlb %v0, %v0, %v1
+; CHECK-NEXT: vsrl %v0, %v0, %v1
; CHECK-NEXT: vlgvg %r2, %v0, 1
; CHECK-NEXT: br %r14
%ax = zext i64 %a to i128
@@ -66,9 +66,9 @@ define i64 @f4(i64 %dummy, i64 %a, i64 %b) {
; CHECK-NEXT: vgbm %v1, 0
; CHECK-NEXT: vlvgg %v1, %r3, 1
; CHECK-NEXT: vlvgg %v0, %r4, 1
-; CHECK-NEXT: vrepib %v2, 64
; CHECK-NEXT: vmlq %v0, %v1, %v0
-; CHECK-NEXT: vsrlb %v1, %v0, %v2
+; CHECK-NEXT: vrepib %v1, 64
+; CHECK-NEXT: vsrlb %v1, %v0, %v1
; CHECK-NEXT: vo %v0, %v1, %v0
; CHECK-NEXT: vlgvg %r2, %v0, 1
; CHECK-NEXT: br %r14
diff --git a/llvm/test/CodeGen/SystemZ/vec-eval.ll b/llvm/test/CodeGen/SystemZ/vec-eval.ll
index f88531e82bc25..778d5703ef3b7 100644
--- a/llvm/test/CodeGen/SystemZ/vec-eval.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-eval.ll
@@ -1540,13 +1540,12 @@ define <16 x i8> @eval109(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) {
; CHECK-LABEL: eval109:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vgbm %v0, 65535
-; CHECK-NEXT: vn %v1, %v26, %v24
; CHECK-NEXT: veval %v0, %v24, %v0, %v26, 40
-; CHECK-NEXT: vo %v2, %v28, %v24
+; CHECK-NEXT: vn %v1, %v26, %v24
; CHECK-NEXT: veval %v0, %v28, %v0, %v1, 7
; CHECK-NEXT: vo %v1, %v28, %v24
; CHECK-NEXT: veval %v0, %v0, %v24, %v26, 47
-; CHECK-NEXT: veval %v24, %v0, %v26, %v2, 47
+; CHECK-NEXT: veval %v24, %v0, %v26, %v1, 47
; CHECK-NEXT: br %r14
entry:
%not = xor <16 x i8> %src1, splat(i8 -1)
@@ -1568,9 +1567,9 @@ define <16 x i8> @eval110(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) {
; CHECK-LABEL: eval110:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vno %v0, %v24, %v24
+; CHECK-NEXT: veval %v0, %v0, %v28, %v26, 2
; CHECK-NEXT: vgbm %v1, 65535
; CHECK-NEXT: vo %v2, %v28, %v24
-; CHECK-NEXT: veval %v0, %v0, %v28, %v26, 2
; CHECK-NEXT: veval %v0, %v0, %v26, %v2, 47
; CHECK-NEXT: veval %v1, %v26, %v1, %v28, 190
; CHECK-NEXT: veval %v24, %v0, %v1, %v24, 31
@@ -1754,13 +1753,12 @@ define <16 x i8> @eval121(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) {
; CHECK-LABEL: eval121:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vgbm %v0, 65535
-; CHECK-NEXT: vn %v1, %v26, %v24
; CHECK-NEXT: veval %v0, %v24, %v0, %v26, 40
-; CHECK-NEXT: vo %v2, %v28, %v26
+; CHECK-NEXT: vn %v1, %v26, %v24
; CHECK-NEXT: veval %v0, %v28, %v0, %v1, 7
; CHECK-NEXT: vo %v1, %v28, %v26
; CHECK-NEXT: veval %v0, %v0, %v26, %v24, 47
-; CHECK-NEXT: veval %v24, %v0, %v24, %v2, 47
+; CHECK-NEXT: veval %v24, %v0, %v24, %v1, 47
; CHECK-NEXT: br %r14
entry:
%not = xor <16 x i8> %src1, splat(i8 -1)
More information about the llvm-commits
mailing list