[llvm] cbf682c - [SystemZ] Improve codegen for memset.
Jonas Paulsson via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 6 10:12:15 PST 2021
Author: Jonas Paulsson
Date: 2021-12-06T12:10:58-06:00
New Revision: cbf682cb1c99c70fa93eb4e32aa5120e4881ba04
URL: https://github.com/llvm/llvm-project/commit/cbf682cb1c99c70fa93eb4e32aa5120e4881ba04
DIFF: https://github.com/llvm/llvm-project/commit/cbf682cb1c99c70fa93eb4e32aa5120e4881ba04.diff
LOG: [SystemZ] Improve codegen for memset.
Memset with a constant length was implemented with a single store followed by
a series of MVCs. This patch changes this so that one store of the byte is
emitted for each MVC, which avoids data dependencies between the MVCs. An
MVI/STC + MVC(len-1) is done for each block.
In addition, memset with a variable length is now also handled without a
libcall. Since the byte is first stored and then MVC is used from that
address, a length of two must now be subtracted instead of one for the loop
and EXRL. This requires an extra check for the one-byte case, which is
handled in a special block with just a single MVI/STC (like GCC).
Review: Ulrich Weigand
Differential Revision: https://reviews.llvm.org/D112004
Added:
llvm/test/CodeGen/SystemZ/memset-07.ll
Modified:
llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
llvm/lib/Target/SystemZ/SystemZISelLowering.h
llvm/lib/Target/SystemZ/SystemZInstrFormats.td
llvm/lib/Target/SystemZ/SystemZInstrInfo.td
llvm/lib/Target/SystemZ/SystemZOperators.td
llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
llvm/test/CodeGen/SystemZ/memset-01.ll
llvm/test/CodeGen/SystemZ/memset-02.ll
llvm/test/CodeGen/SystemZ/memset-04.ll
llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 71432218068e..a5e6c906453f 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -5714,6 +5714,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(OC);
OPCODE(XC);
OPCODE(CLC);
+ OPCODE(MEMSET_MVC);
OPCODE(STPCPY);
OPCODE(STRCMP);
OPCODE(SEARCH_STRING);
@@ -7860,8 +7861,10 @@ MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
return MBB;
}
-MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
- MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
+MachineBasicBlock *
+SystemZTargetLowering::emitMemMemWrapper(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ unsigned Opcode, bool IsMemset) const {
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
@@ -7870,18 +7873,64 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
uint64_t DestDisp = MI.getOperand(1).getImm();
- MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
- uint64_t SrcDisp = MI.getOperand(3).getImm();
- MachineOperand &LengthMO = MI.getOperand(4);
+ MachineOperand SrcBase = MachineOperand::CreateReg(0U, false);
+ uint64_t SrcDisp;
+
+ // Fold the displacement Disp if it is out of range.
+ auto foldDisplIfNeeded = [&](MachineOperand &Base, uint64_t &Disp) -> void {
+ if (!isUInt<12>(Disp)) {
+ Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ unsigned Opcode = TII->getOpcodeForOffset(SystemZ::LA, Disp);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opcode), Reg)
+ .add(Base).addImm(Disp).addReg(0);
+ Base = MachineOperand::CreateReg(Reg, false);
+ Disp = 0;
+ }
+ };
+
+ if (!IsMemset) {
+ SrcBase = earlyUseOperand(MI.getOperand(2));
+ SrcDisp = MI.getOperand(3).getImm();
+ } else {
+ SrcBase = DestBase;
+ SrcDisp = DestDisp++;
+ foldDisplIfNeeded(DestBase, DestDisp);
+ }
+
+ MachineOperand &LengthMO = MI.getOperand(IsMemset ? 2 : 4);
bool IsImmForm = LengthMO.isImm();
bool IsRegForm = !IsImmForm;
+ // Build and insert one Opcode of Length, with special treatment for memset.
+ auto insertMemMemOp = [&](MachineBasicBlock *InsMBB,
+ MachineBasicBlock::iterator InsPos,
+ MachineOperand DBase, uint64_t DDisp,
+ MachineOperand SBase, uint64_t SDisp,
+ unsigned Length) -> void {
+ assert(Length > 0 && Length <= 256 && "Building memory op with bad length.");
+ if (IsMemset) {
+ MachineOperand ByteMO = earlyUseOperand(MI.getOperand(3));
+ if (ByteMO.isImm())
+ BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::MVI))
+ .add(SBase).addImm(SDisp).add(ByteMO);
+ else
+ BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::STC))
+ .add(ByteMO).add(SBase).addImm(SDisp).addReg(0);
+ if (--Length == 0)
+ return;
+ }
+ BuildMI(*MBB, InsPos, DL, TII->get(Opcode))
+ .add(DBase).addImm(DDisp).addImm(Length)
+ .add(SBase).addImm(SDisp)
+ .setMemRefs(MI.memoperands());
+ };
+
bool NeedsLoop = false;
uint64_t ImmLength = 0;
- Register LenMinus1Reg = SystemZ::NoRegister;
+ Register LenAdjReg = SystemZ::NoRegister;
if (IsImmForm) {
ImmLength = LengthMO.getImm();
- ImmLength++; // Add back the '1' subtracted originally.
+ ImmLength += IsMemset ? 2 : 1; // Add back the subtracted adjustment.
if (ImmLength == 0) {
MI.eraseFromParent();
return MBB;
@@ -7905,7 +7954,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
NeedsLoop = true;
} else {
NeedsLoop = true;
- LenMinus1Reg = LengthMO.getReg();
+ LenAdjReg = LengthMO.getReg();
}
// When generating more than one CLC, all but the last will need to
@@ -7923,17 +7972,17 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
ImmLength &= 255;
} else {
BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg)
- .addReg(LenMinus1Reg)
+ .addReg(LenAdjReg)
.addReg(0)
.addImm(8);
}
+ bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
auto loadZeroAddress = [&]() -> MachineOperand {
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0);
return MachineOperand::CreateReg(Reg, false);
};
- bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister)
DestBase = loadZeroAddress();
if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister)
@@ -7968,14 +8017,41 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
DoneMBB = SystemZ::emitBlockAfter(NextMBB);
// MBB:
- // # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall thru to StartMBB.
+ // # Jump to AllDoneMBB if LenAdjReg means 0, or fall thru to StartMBB.
BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
- .addReg(LenMinus1Reg).addImm(-1);
+ .addReg(LenAdjReg).addImm(IsMemset ? -2 : -1);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
.addMBB(AllDoneMBB);
MBB->addSuccessor(AllDoneMBB);
- MBB->addSuccessor(StartMBB);
+ if (!IsMemset)
+ MBB->addSuccessor(StartMBB);
+ else {
+ // MemsetOneCheckMBB:
+ // # Jump to MemsetOneMBB for a memset of length 1, or
+ // # fall thru to StartMBB.
+ MachineBasicBlock *MemsetOneCheckMBB = SystemZ::emitBlockAfter(MBB);
+ MachineBasicBlock *MemsetOneMBB = SystemZ::emitBlockAfter(&*MF.rbegin());
+ MBB->addSuccessor(MemsetOneCheckMBB);
+ MBB = MemsetOneCheckMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+ .addReg(LenAdjReg).addImm(-1);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
+ .addMBB(MemsetOneMBB);
+ MBB->addSuccessor(MemsetOneMBB, {10, 100});
+ MBB->addSuccessor(StartMBB, {90, 100});
+
+ // MemsetOneMBB:
+ // # Jump back to AllDoneMBB after a single MVI or STC.
+ MBB = MemsetOneMBB;
+ insertMemMemOp(MBB, MBB->end(),
+ MachineOperand::CreateReg(StartDestReg, false), DestDisp,
+ MachineOperand::CreateReg(StartSrcReg, false), SrcDisp,
+ 1);
+ BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(AllDoneMBB);
+ MBB->addSuccessor(AllDoneMBB);
+ }
// StartMBB:
// # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB.
@@ -8032,10 +8108,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
if (Opcode == SystemZ::MVC)
BuildMI(MBB, DL, TII->get(SystemZ::PFD))
.addImm(SystemZ::PFD_WRITE)
- .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
- BuildMI(MBB, DL, TII->get(Opcode))
- .addReg(ThisDestReg).addImm(DestDisp).addImm(256)
- .addReg(ThisSrcReg).addImm(SrcDisp);
+ .addReg(ThisDestReg).addImm(DestDisp - IsMemset + 768).addReg(0);
+ insertMemMemOp(MBB, MBB->end(),
+ MachineOperand::CreateReg(ThisDestReg, false), DestDisp,
+ MachineOperand::CreateReg(ThisSrcReg, false), SrcDisp, 256);
if (EndMBB) {
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
@@ -8075,7 +8151,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
// # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run.
// # Use EXecute Relative Long for the remainder of the bytes. The target
// instruction of the EXRL will have a length field of 1 since 0 is an
- // illegal value. The number of bytes processed becomes (%LenMinus1Reg &
+ // illegal value. The number of bytes processed becomes (%LenAdjReg &
// 0xff) + 1.
// # Fall through to AllDoneMBB.
Register RemSrcReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
@@ -8088,10 +8164,14 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg)
.addReg(StartSrcReg).addMBB(StartMBB)
.addReg(NextSrcReg).addMBB(NextMBB);
+ if (IsMemset)
+ insertMemMemOp(MBB, MBB->end(),
+ MachineOperand::CreateReg(RemDestReg, false), DestDisp,
+ MachineOperand::CreateReg(RemSrcReg, false), SrcDisp, 1);
MachineInstrBuilder EXRL_MIB =
BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo))
.addImm(Opcode)
- .addReg(LenMinus1Reg)
+ .addReg(LenAdjReg)
.addReg(RemDestReg).addImm(DestDisp)
.addReg(RemSrcReg).addImm(SrcDisp);
MBB->addSuccessor(AllDoneMBB);
@@ -8107,32 +8187,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
while (ImmLength > 0) {
uint64_t ThisLength = std::min(ImmLength, uint64_t(256));
// The previous iteration might have created out-of-range displacements.
- // Apply them using LAY if so.
- if (!isUInt<12>(DestDisp)) {
- Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
- BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
- .add(DestBase)
- .addImm(DestDisp)
- .addReg(0);
- DestBase = MachineOperand::CreateReg(Reg, false);
- DestDisp = 0;
- }
- if (!isUInt<12>(SrcDisp)) {
- Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
- BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
- .add(SrcBase)
- .addImm(SrcDisp)
- .addReg(0);
- SrcBase = MachineOperand::CreateReg(Reg, false);
- SrcDisp = 0;
- }
- BuildMI(*MBB, MI, DL, TII->get(Opcode))
- .add(DestBase)
- .addImm(DestDisp)
- .addImm(ThisLength)
- .add(SrcBase)
- .addImm(SrcDisp)
- .setMemRefs(MI.memoperands());
+ // Apply them using LA/LAY if so.
+ foldDisplIfNeeded(DestBase, DestDisp);
+ foldDisplIfNeeded(SrcBase, SrcDisp);
+ insertMemMemOp(MBB, MI, DestBase, DestDisp, SrcBase, SrcDisp, ThisLength);
DestDisp += ThisLength;
SrcDisp += ThisLength;
ImmLength -= ThisLength;
@@ -8630,6 +8688,11 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
case SystemZ::CLCImm:
case SystemZ::CLCReg:
return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
+ case SystemZ::MemsetImmImm:
+ case SystemZ::MemsetImmReg:
+ case SystemZ::MemsetRegImm:
+ case SystemZ::MemsetRegReg:
+ return emitMemMemWrapper(MI, MBB, SystemZ::MVC, true/*IsMemset*/);
case SystemZ::CLSTLoop:
return emitStringWrapper(MI, MBB, SystemZ::CLST);
case SystemZ::MVSTLoop:
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 461f804ca55e..940c0a857ea4 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -126,6 +126,9 @@ enum NodeType : unsigned {
// as for MVC.
CLC,
+ // Use MVC to set a block of memory after storing the first byte.
+ MEMSET_MVC,
+
// Use an MVST-based sequence to implement stpcpy().
STPCPY,
@@ -709,7 +712,8 @@ class SystemZTargetLowering : public TargetLowering {
MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB,
- unsigned Opcode) const;
+ unsigned Opcode,
+ bool IsMemset = false) const;
MachineBasicBlock *emitStringWrapper(MachineInstr &MI, MachineBasicBlock *BB,
unsigned Opcode) const;
MachineBasicBlock *emitTransactionBegin(MachineInstr &MI,
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index cd60fff1ab11..e513befd0d6f 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5256,6 +5256,16 @@ class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2>
let Constraints = "$R1 = $R1src";
}
+class MemsetPseudo<DAGOperand lenop, DAGOperand byteop>
+ : Pseudo<(outs), (ins bdaddr12only:$dest, lenop:$length, byteop:$B),
+ [(z_memset_mvc bdaddr12only:$dest, lenop:$length, byteop:$B)]> {
+ let Defs = [CC];
+ let mayLoad = 1;
+ let mayStore = 1;
+ let usesCustomInserter = 1;
+ let hasNoSchedulingInfo = 1;
+}
+
//===----------------------------------------------------------------------===//
// Multiclasses that emit both real and pseudo instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index e4760229fd6b..84f1e0fb428c 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -510,6 +510,12 @@ let mayLoad = 1, mayStore = 1, Defs = [CC] in {
def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>;
}
+// Memset[Length][Byte] pseudos.
+def MemsetImmImm : MemsetPseudo<imm64, imm32zx8trunc>;
+def MemsetImmReg : MemsetPseudo<imm64, GR32>;
+def MemsetRegImm : MemsetPseudo<ADDR64, imm32zx8trunc>;
+def MemsetRegReg : MemsetPseudo<ADDR64, GR32>;
+
// Move right.
let Predicates = [FeatureMiscellaneousExtensions3],
mayLoad = 1, mayStore = 1, Uses = [R0L] in
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index 927d97233286..9935416559bc 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -102,6 +102,10 @@ def SDT_ZMemMemLengthCC : SDTypeProfile<1, 3,
SDTCisPtrTy<1>,
SDTCisPtrTy<2>,
SDTCisVT<3, i64>]>;
+def SDT_ZMemsetMVC : SDTypeProfile<0, 3,
+ [SDTCisPtrTy<0>,
+ SDTCisVT<1, i64>,
+ SDTCisVT<2, i32>]>;
def SDT_ZString : SDTypeProfile<1, 3,
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
@@ -413,6 +417,8 @@ def z_xc : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC,
[SDNPHasChain, SDNPMayLoad]>;
+def z_memset_mvc : SDNode<"SystemZISD::MEMSET_MVC", SDT_ZMemsetMVC,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC,
[SDNPHasChain, SDNPMayLoad]>;
def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString,
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index f38e93109967..db4b4879b33a 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -17,29 +17,44 @@ using namespace llvm;
#define DEBUG_TYPE "systemz-selectiondag-info"
-static SDVTList getMemMemVTs(unsigned Op, SelectionDAG &DAG) {
- return Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other)
- : DAG.getVTList(MVT::Other);
+static unsigned getMemMemLenAdj(unsigned Op) {
+ return Op == SystemZISD::MEMSET_MVC ? 2 : 1;
}
-// Emit a mem-mem operation after subtracting one from size, which will be
-// added back during pseudo expansion. As the Reg case emitted here may be
-// converted by DAGCombiner into having an Imm length, they are both emitted
-// the same way.
+static SDValue createMemMemNode(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue LenAdj, SDValue Byte) {
+ SDVTList VTs = Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other)
+ : DAG.getVTList(MVT::Other);
+ SmallVector<SDValue, 6> Ops;
+ if (Op == SystemZISD::MEMSET_MVC)
+ Ops = { Chain, Dst, LenAdj, Byte };
+ else
+ Ops = { Chain, Dst, Src, LenAdj };
+ return DAG.getNode(Op, DL, VTs, Ops);
+}
+
+// Emit a mem-mem operation after subtracting one (or two for memset) from
+// size, which will be added back during pseudo expansion. As the Reg case
+// emitted here may be converted by DAGCombiner into having an Imm length,
+// they are both emitted the same way.
static SDValue emitMemMemImm(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
SDValue Chain, SDValue Dst, SDValue Src,
- uint64_t Size) {
- return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src,
- DAG.getConstant(Size - 1, DL, Src.getValueType()));
+ uint64_t Size, SDValue Byte = SDValue()) {
+ unsigned Adj = getMemMemLenAdj(Op);
+ assert(Size >= Adj && "Adjusted length overflow.");
+ SDValue LenAdj = DAG.getConstant(Size - Adj, DL, Dst.getValueType());
+ return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte);
}
static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size) {
- SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64,
- DAG.getZExtOrTrunc(Size, DL, MVT::i64),
- DAG.getConstant(-1, DL, MVT::i64));
- return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, LenMinus1);
+ SDValue Size, SDValue Byte = SDValue()) {
+ int64_t Adj = getMemMemLenAdj(Op);
+ SDValue LenAdj = DAG.getNode(ISD::ADD, DL, MVT::i64,
+ DAG.getZExtOrTrunc(Size, DL, MVT::i64),
+ DAG.getConstant(0 - Adj, DL, MVT::i64));
+ return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte);
}
SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy(
@@ -127,13 +142,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
if (CByte && CByte->getZExtValue() == 0)
return emitMemMemImm(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Bytes);
- // Copy the byte to the first location and then use MVC to copy
- // it to the rest.
- Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment);
- SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
- DAG.getConstant(1, DL, PtrVT));
- return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, DstPlus1, Dst,
- Bytes - 1);
+ return emitMemMemImm(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(),
+ Bytes, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32));
}
// Variable length
@@ -141,7 +151,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
// Handle the special case of a variable length memset of 0 with XC.
return emitMemMemReg(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Size);
- return SDValue();
+ return emitMemMemReg(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(),
+ Size, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32));
}
// Convert the current CC value into an integer that is 0 if CC == 0,
diff --git a/llvm/test/CodeGen/SystemZ/memset-01.ll b/llvm/test/CodeGen/SystemZ/memset-01.ll
index 73b3ffa5b4a9..c621b2ee96de 100644
--- a/llvm/test/CodeGen/SystemZ/memset-01.ll
+++ b/llvm/test/CodeGen/SystemZ/memset-01.ll
@@ -87,7 +87,8 @@ define void @f8(i8* %dest, i8 %val) {
define void @f9(i8* %dest, i8 %val) {
; CHECK-LABEL: f9:
; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 257, i1 false)
ret void
@@ -97,7 +98,8 @@ define void @f9(i8* %dest, i8 %val) {
define void @f10(i8* %dest, i8 %val) {
; CHECK-LABEL: f10:
; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 257, i1 false)
ret void
@@ -107,7 +109,8 @@ define void @f10(i8* %dest, i8 %val) {
define void @f11(i8* %dest, i8 %val) {
; CHECK-LABEL: f11:
; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 258, i1 false)
@@ -118,7 +121,8 @@ define void @f11(i8* %dest, i8 %val) {
define void @f12(i8* %dest, i8 %val) {
; CHECK-LABEL: f12:
; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 258, i1 false)
@@ -129,30 +133,88 @@ define void @f12(i8* %dest, i8 %val) {
define void @f13(i8* %dest, i8 %val) {
; CHECK-LABEL: f13:
; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
-; CHECK: mvc 257(256,%r2), 256(%r2)
-; CHECK: mvc 513(256,%r2), 512(%r2)
-; CHECK: mvc 769(256,%r2), 768(%r2)
-; CHECK: mvc 1025(256,%r2), 1024(%r2)
-; CHECK: mvc 1281(256,%r2), 1280(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
+; CHECK: mvc 257(255,%r2), 256(%r2)
+; CHECK: stc %r3, 512(%r2)
+; CHECK: mvc 513(255,%r2), 512(%r2)
+; CHECK: stc %r3, 768(%r2)
+; CHECK: mvc 769(255,%r2), 768(%r2)
+; CHECK: stc %r3, 1024(%r2)
+; CHECK: mvc 1025(255,%r2), 1024(%r2)
+; CHECK: stc %r3, 1280(%r2)
+; CHECK: mvc 1281(255,%r2), 1280(%r2)
; CHECK: br %r14
- call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1537, i1 false)
+ call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1536, i1 false)
ret void
}
; Test the next size up, which uses a loop. We leave the other corner
-; cases to memcpy-01.ll.
+; cases to memcpy-01.ll and memset-07.ll.
define void @f14(i8* %dest, i8 %val) {
; CHECK-LABEL: f14:
-; CHECK: stc %r3, 0(%r2)
; CHECK: lghi [[COUNT:%r[0-5]]], 6
; CHECK: [[LABEL:\.L[^:]*]]:
-; CHECK: pfd 2, 769(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: pfd 2, 768(%r2)
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: la %r2, 256(%r2)
; CHECK: brctg [[COUNT]], [[LABEL]]
-; CHECK: mvc 1(1,%r2), 0(%r2)
-; CHECK: br %r14
- call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1538, i1 false)
+; CHECK: stc %r3, 0(%r2)
+; CHECK-NEXT: br %r14
+ call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1537, i1 false)
ret void
}
+
+; Test (no) folding of displacement: Begins with max(uint12) - 1.
+define void @f15(i8* %dest, i8 %val) {
+; CHECK-LABEL: f15:
+; CHECK-NOT: la {{.*}}%r2
+ %addr = getelementptr i8, i8* %dest, i64 4094
+ call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 256, i1 false)
+ ret void
+}
+
+; Test folding of displacement: Begins with max(uint12).
+define void @f16(i8* %dest, i8 %val) {
+; CHECK-LABEL: f16:
+; CHECK-DAG: lay %r1, 4096(%r2)
+; CHECK-DAG: stc %r3, 4095(%r2)
+ %addr = getelementptr i8, i8* %dest, i64 4095
+ call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 256, i1 false)
+ ret void
+}
+
+; Test folding of displacement with LA: First two ops are in range.
+define void @f17(i8* %dest, i8 %val) {
+; CHECK-LABEL: f17:
+; CHECK: stc %r3, 3583(%r2)
+; CHECK-NEXT: mvc 3584(255,%r2), 3583(%r2)
+; CHECK-NEXT: stc %r3, 3839(%r2)
+; CHECK-NEXT: mvc 3840(255,%r2), 3839(%r2)
+; CHECK-NEXT: lay %r1, 4096(%r2)
+; CHECK-NEXT: stc %r3, 4095(%r2)
+; CHECK-NEXT: mvc 0(1,%r1), 4095(%r2)
+; CHECK-NEXT: br %r14
+ %addr = getelementptr i8, i8* %dest, i64 3583
+ call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 514, i1 false)
+ ret void
+}
+
+; Test folding of displacement with LAY: First two ops are in range.
+define void @f18(i8* %dest, i8 %val) {
+; CHECK-LABEL: f18:
+; CHECK: stc %r3, 3584(%r2)
+; CHECK-NEXT: mvc 3585(255,%r2), 3584(%r2)
+; CHECK-NEXT: stc %r3, 3840(%r2)
+; CHECK-NEXT: mvc 3841(255,%r2), 3840(%r2)
+; CHECK-NEXT: lay %r1, 4097(%r2)
+; CHECK-NEXT: lay %r2, 4096(%r2)
+; CHECK-NEXT: stc %r3, 0(%r2)
+; CHECK-NEXT: mvc 0(1,%r1), 0(%r2)
+; CHECK-NEXT: br %r14
+ %addr = getelementptr i8, i8* %dest, i64 3584
+ call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 514, i1 false)
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/SystemZ/memset-02.ll b/llvm/test/CodeGen/SystemZ/memset-02.ll
index 3f5ffca3398b..52dd6d2f48e5 100644
--- a/llvm/test/CodeGen/SystemZ/memset-02.ll
+++ b/llvm/test/CodeGen/SystemZ/memset-02.ll
@@ -123,7 +123,8 @@ define void @f12(i8* %dest) {
define void @f13(i8* %dest) {
; CHECK-LABEL: f13:
; CHECK: mvi 0(%r2), 128
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 128
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 128, i32 257, i1 false)
ret void
@@ -133,7 +134,8 @@ define void @f13(i8* %dest) {
define void @f14(i8* %dest) {
; CHECK-LABEL: f14:
; CHECK: mvi 0(%r2), 128
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 128
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 128, i64 257, i1 false)
ret void
@@ -143,7 +145,8 @@ define void @f14(i8* %dest) {
define void @f15(i8* %dest) {
; CHECK-LABEL: f15:
; CHECK: mvi 0(%r2), 128
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 128
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 128, i32 258, i1 false)
@@ -154,7 +157,8 @@ define void @f15(i8* %dest) {
define void @f16(i8* %dest) {
; CHECK-LABEL: f16:
; CHECK: mvi 0(%r2), 128
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 128
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 128, i64 258, i1 false)
diff --git a/llvm/test/CodeGen/SystemZ/memset-04.ll b/llvm/test/CodeGen/SystemZ/memset-04.ll
index dcb8b6bad814..825d69c4b46e 100644
--- a/llvm/test/CodeGen/SystemZ/memset-04.ll
+++ b/llvm/test/CodeGen/SystemZ/memset-04.ll
@@ -359,7 +359,8 @@ define void @f36(i8* %dest) {
define void @f37(i8* %dest) {
; CHECK-LABEL: f37:
; CHECK: mvi 0(%r2), 255
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 255
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 -1, i32 257, i1 false)
ret void
@@ -369,7 +370,8 @@ define void @f37(i8* %dest) {
define void @f38(i8* %dest) {
; CHECK-LABEL: f38:
; CHECK: mvi 0(%r2), 255
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 255
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 -1, i64 257, i1 false)
ret void
@@ -379,7 +381,8 @@ define void @f38(i8* %dest) {
define void @f39(i8* %dest) {
; CHECK-LABEL: f39:
; CHECK: mvi 0(%r2), 255
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 255
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 -1, i32 258, i1 false)
@@ -390,7 +393,8 @@ define void @f39(i8* %dest) {
define void @f40(i8* %dest) {
; CHECK-LABEL: f40:
; CHECK: mvi 0(%r2), 255
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 255
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 -1, i64 258, i1 false)
diff --git a/llvm/test/CodeGen/SystemZ/memset-07.ll b/llvm/test/CodeGen/SystemZ/memset-07.ll
new file mode 100644
index 000000000000..0d08298c8647
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/memset-07.ll
@@ -0,0 +1,100 @@
+; Test memset in cases where a loop is used.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare void @llvm.memset.p0i8.i32(i8 *nocapture, i8, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i64(i8 *nocapture, i8, i64, i1) nounwind
+
+; Constant length: 6 iterations and 2 bytes remainder.
+define void @f1(i8* %dest, i8 %val) {
+; CHECK-LABEL: f1:
+; CHECK: lghi [[COUNT:%r[0-5]]], 6
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 768(%r2)
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: la %r2, 256(%r2)
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: stc %r3, 0(%r2)
+; CHECK-NEXT: mvc 1(1,%r2), 0(%r2)
+; CHECK-NEXT: br %r14
+ call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1538, i1 false)
+ ret void
+}
+
+; Constant length: 6 iterations and 255 bytes remainder.
+define void @f2(i8* %dest) {
+; CHECK-LABEL: f2:
+; CHECK: lghi [[COUNT:%r[0-5]]], 6
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 768(%r2)
+; CHECK: mvi 0(%r2), 1
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: la %r2, 256(%r2)
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvi 0(%r2), 1
+; CHECK-NEXT: mvc 1(254,%r2), 0(%r2)
+; CHECK-NEXT: br %r14
+ call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 1791, i1 false)
+ ret void
+}
+
+; Variable length, byte in register.
+define void @f3(i8* %dest, i8 %val, i64 %Len) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: aghi %r4, -2
+; CHECK-NEXT: cgibe %r4, -2, 0(%r14)
+; CHECK-NEXT: .LBB2_1:
+; CHECK-NEXT: cgije %r4, -1, .LBB2_5
+; CHECK-NEXT:# %bb.2:
+; CHECK-NEXT: srlg %r0, %r4, 8
+; CHECK-NEXT: cgije %r0, 0, .LBB2_4
+; CHECK-NEXT:.LBB2_3: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: pfd 2, 768(%r2)
+; CHECK-NEXT: stc %r3, 0(%r2)
+; CHECK-NEXT: mvc 1(255,%r2), 0(%r2)
+; CHECK-NEXT: la %r2, 256(%r2)
+; CHECK-NEXT: brctg %r0, .LBB2_3
+; CHECK-NEXT:.LBB2_4:
+; CHECK-NEXT: stc %r3, 0(%r2)
+; CHECK-NEXT: exrl %r4, .Ltmp0
+; CHECK-NEXT: br %r14
+; CHECK-NEXT:.LBB2_5:
+; CHECK-NEXT: stc %r3, 0(%r2)
+; CHECK-NEXT: br %r14
+ call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 %Len, i1 false)
+ ret void
+}
+
+; Variable length, immediate byte.
+define void @f4(i8* %dest, i32 %Len) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: llgfr %r1, %r3
+; CHECK-NEXT: aghi %r1, -2
+; CHECK-NEXT: cgibe %r1, -2, 0(%r14)
+; CHECK-NEXT:.LBB3_1:
+; CHECK-NEXT: cgije %r1, -1, .LBB3_5
+; CHECK-NEXT:# %bb.2:
+; CHECK-NEXT: srlg %r0, %r1, 8
+; CHECK-NEXT: cgije %r0, 0, .LBB3_4
+; CHECK-NEXT:.LBB3_3: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: pfd 2, 768(%r2)
+; CHECK-NEXT: mvi 0(%r2), 1
+; CHECK-NEXT: mvc 1(255,%r2), 0(%r2)
+; CHECK-NEXT: la %r2, 256(%r2)
+; CHECK-NEXT: brctg %r0, .LBB3_3
+; CHECK-NEXT:.LBB3_4:
+; CHECK-NEXT: mvi 0(%r2), 1
+; CHECK-NEXT: exrl %r1, .Ltmp0
+; CHECK-NEXT: br %r14
+; CHECK-NEXT:.LBB3_5:
+; CHECK-NEXT: mvi 0(%r2), 1
+; CHECK-NEXT: br %r14
+ call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 %Len, i1 false)
+ ret void
+}
+
+; CHECK: .Ltmp0:
+; CHECK-NEXT: mvc 1(1,%r2), 0(%r2)
diff --git a/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll b/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll
index 7ceab2795a68..6b3c979651c4 100644
--- a/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll
@@ -12,7 +12,7 @@ entry:
; CHECK: jg memset
define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 {
entry:
- tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i1 false)
+ tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i1 true)
ret void
}
More information about the llvm-commits
mailing list