[llvm] cbf682c - [SystemZ] Improve codegen for memset.

Jonas Paulsson via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 6 10:12:15 PST 2021


Author: Jonas Paulsson
Date: 2021-12-06T12:10:58-06:00
New Revision: cbf682cb1c99c70fa93eb4e32aa5120e4881ba04

URL: https://github.com/llvm/llvm-project/commit/cbf682cb1c99c70fa93eb4e32aa5120e4881ba04
DIFF: https://github.com/llvm/llvm-project/commit/cbf682cb1c99c70fa93eb4e32aa5120e4881ba04.diff

LOG: [SystemZ] Improve codegen for memset.

Memset with a constant length was implemented with a single store followed by
a series of MVCs. This patch changes this so that one store of the byte is
emitted for each MVC, which avoids data dependencies between the MVCs. An
MVI/STC + MVC(len-1) is done for each block.

In addition, memset with a variable length is now also handled without a
libcall. Since the byte is first stored and then MVC is used from that
address, a length of two must now be subtracted instead of one for the loop
and EXRL. This requires an extra check for the one-byte case, which is
handled in a special block with just a single MVI/STC (like GCC).

Review: Ulrich Weigand

Differential Revision: https://reviews.llvm.org/D112004

Added: 
    llvm/test/CodeGen/SystemZ/memset-07.ll

Modified: 
    llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
    llvm/lib/Target/SystemZ/SystemZISelLowering.h
    llvm/lib/Target/SystemZ/SystemZInstrFormats.td
    llvm/lib/Target/SystemZ/SystemZInstrInfo.td
    llvm/lib/Target/SystemZ/SystemZOperators.td
    llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
    llvm/test/CodeGen/SystemZ/memset-01.ll
    llvm/test/CodeGen/SystemZ/memset-02.ll
    llvm/test/CodeGen/SystemZ/memset-04.ll
    llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 71432218068e..a5e6c906453f 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -5714,6 +5714,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
     OPCODE(OC);
     OPCODE(XC);
     OPCODE(CLC);
+    OPCODE(MEMSET_MVC);
     OPCODE(STPCPY);
     OPCODE(STRCMP);
     OPCODE(SEARCH_STRING);
@@ -7860,8 +7861,10 @@ MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
   return MBB;
 }
 
-MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
-    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
+MachineBasicBlock *
+SystemZTargetLowering::emitMemMemWrapper(MachineInstr &MI,
+                                         MachineBasicBlock *MBB,
+                                         unsigned Opcode, bool IsMemset) const {
   MachineFunction &MF = *MBB->getParent();
   const SystemZInstrInfo *TII =
       static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
@@ -7870,18 +7873,64 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
 
   MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
   uint64_t DestDisp = MI.getOperand(1).getImm();
-  MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
-  uint64_t SrcDisp = MI.getOperand(3).getImm();
-  MachineOperand &LengthMO = MI.getOperand(4);
+  MachineOperand SrcBase = MachineOperand::CreateReg(0U, false);
+  uint64_t SrcDisp;
+
+  // Fold the displacement Disp if it is out of range.
+  auto foldDisplIfNeeded = [&](MachineOperand &Base, uint64_t &Disp) -> void {
+    if (!isUInt<12>(Disp)) {
+      Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+      unsigned Opcode = TII->getOpcodeForOffset(SystemZ::LA, Disp);
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opcode), Reg)
+        .add(Base).addImm(Disp).addReg(0);
+      Base = MachineOperand::CreateReg(Reg, false);
+      Disp = 0;
+    }
+  };
+
+  if (!IsMemset) {
+    SrcBase = earlyUseOperand(MI.getOperand(2));
+    SrcDisp = MI.getOperand(3).getImm();
+  } else {
+    SrcBase = DestBase;
+    SrcDisp = DestDisp++;
+    foldDisplIfNeeded(DestBase, DestDisp);
+  }
+
+  MachineOperand &LengthMO = MI.getOperand(IsMemset ? 2 : 4);
   bool IsImmForm = LengthMO.isImm();
   bool IsRegForm = !IsImmForm;
 
+  // Build and insert one Opcode of Length, with special treatment for memset.
+  auto insertMemMemOp = [&](MachineBasicBlock *InsMBB,
+                            MachineBasicBlock::iterator InsPos,
+                            MachineOperand DBase, uint64_t DDisp,
+                            MachineOperand SBase, uint64_t SDisp,
+                            unsigned Length) -> void {
+    assert(Length > 0 && Length <= 256 && "Building memory op with bad length.");
+    if (IsMemset) {
+      MachineOperand ByteMO = earlyUseOperand(MI.getOperand(3));
+      if (ByteMO.isImm())
+        BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::MVI))
+          .add(SBase).addImm(SDisp).add(ByteMO);
+      else
+        BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::STC))
+          .add(ByteMO).add(SBase).addImm(SDisp).addReg(0);
+      if (--Length == 0)
+        return;
+    }
+    BuildMI(*MBB, InsPos, DL, TII->get(Opcode))
+      .add(DBase).addImm(DDisp).addImm(Length)
+      .add(SBase).addImm(SDisp)
+      .setMemRefs(MI.memoperands());
+  };
+
   bool NeedsLoop = false;
   uint64_t ImmLength = 0;
-  Register LenMinus1Reg = SystemZ::NoRegister;
+  Register LenAdjReg = SystemZ::NoRegister;
   if (IsImmForm) {
     ImmLength = LengthMO.getImm();
-    ImmLength++; // Add back the '1' subtracted originally.
+    ImmLength += IsMemset ? 2 : 1; // Add back the subtracted adjustment.
     if (ImmLength == 0) {
       MI.eraseFromParent();
       return MBB;
@@ -7905,7 +7954,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
       NeedsLoop = true;
   } else {
     NeedsLoop = true;
-    LenMinus1Reg = LengthMO.getReg();
+    LenAdjReg = LengthMO.getReg();
   }
 
   // When generating more than one CLC, all but the last will need to
@@ -7923,17 +7972,17 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
       ImmLength &= 255;
     } else {
       BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg)
-        .addReg(LenMinus1Reg)
+        .addReg(LenAdjReg)
         .addReg(0)
         .addImm(8);
     }
 
+    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
     auto loadZeroAddress = [&]() -> MachineOperand {
       Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
       BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0);
       return MachineOperand::CreateReg(Reg, false);
     };
-    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
     if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister)
       DestBase = loadZeroAddress();
     if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister)
@@ -7968,14 +8017,41 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
       DoneMBB = SystemZ::emitBlockAfter(NextMBB);
 
       //  MBB:
-      //   # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall thru to StartMBB.
+      //   # Jump to AllDoneMBB if LenAdjReg means 0, or fall thru to StartMBB.
       BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
-        .addReg(LenMinus1Reg).addImm(-1);
+        .addReg(LenAdjReg).addImm(IsMemset ? -2 : -1);
       BuildMI(MBB, DL, TII->get(SystemZ::BRC))
         .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
         .addMBB(AllDoneMBB);
       MBB->addSuccessor(AllDoneMBB);
-      MBB->addSuccessor(StartMBB);
+      if (!IsMemset)
+        MBB->addSuccessor(StartMBB);
+      else {
+        // MemsetOneCheckMBB:
+        // # Jump to MemsetOneMBB for a memset of length 1, or
+        // # fall thru to StartMBB.
+        MachineBasicBlock *MemsetOneCheckMBB = SystemZ::emitBlockAfter(MBB);
+        MachineBasicBlock *MemsetOneMBB = SystemZ::emitBlockAfter(&*MF.rbegin());
+        MBB->addSuccessor(MemsetOneCheckMBB);
+        MBB = MemsetOneCheckMBB;
+        BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+          .addReg(LenAdjReg).addImm(-1);
+        BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+          .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
+          .addMBB(MemsetOneMBB);
+        MBB->addSuccessor(MemsetOneMBB, {10, 100});
+        MBB->addSuccessor(StartMBB, {90, 100});
+
+        // MemsetOneMBB:
+        // # Jump back to AllDoneMBB after a single MVI or STC.
+        MBB = MemsetOneMBB;
+        insertMemMemOp(MBB, MBB->end(),
+                       MachineOperand::CreateReg(StartDestReg, false), DestDisp,
+                       MachineOperand::CreateReg(StartSrcReg, false), SrcDisp,
+                       1);
+        BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(AllDoneMBB);
+        MBB->addSuccessor(AllDoneMBB);
+      }
 
       // StartMBB:
       // # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB.
@@ -8032,10 +8108,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
     if (Opcode == SystemZ::MVC)
       BuildMI(MBB, DL, TII->get(SystemZ::PFD))
         .addImm(SystemZ::PFD_WRITE)
-        .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
-    BuildMI(MBB, DL, TII->get(Opcode))
-      .addReg(ThisDestReg).addImm(DestDisp).addImm(256)
-      .addReg(ThisSrcReg).addImm(SrcDisp);
+        .addReg(ThisDestReg).addImm(DestDisp - IsMemset + 768).addReg(0);
+    insertMemMemOp(MBB, MBB->end(),
+                   MachineOperand::CreateReg(ThisDestReg, false), DestDisp,
+                   MachineOperand::CreateReg(ThisSrcReg, false), SrcDisp, 256);
     if (EndMBB) {
       BuildMI(MBB, DL, TII->get(SystemZ::BRC))
         .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
@@ -8075,7 +8151,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
       // # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run.
       // # Use EXecute Relative Long for the remainder of the bytes. The target
       //   instruction of the EXRL will have a length field of 1 since 0 is an
-      //   illegal value. The number of bytes processed becomes (%LenMinus1Reg &
+      //   illegal value. The number of bytes processed becomes (%LenAdjReg &
       //   0xff) + 1.
       // # Fall through to AllDoneMBB.
       Register RemSrcReg  = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
@@ -8088,10 +8164,14 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
         BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg)
           .addReg(StartSrcReg).addMBB(StartMBB)
           .addReg(NextSrcReg).addMBB(NextMBB);
+      if (IsMemset)
+        insertMemMemOp(MBB, MBB->end(),
+                       MachineOperand::CreateReg(RemDestReg, false), DestDisp,
+                       MachineOperand::CreateReg(RemSrcReg, false), SrcDisp, 1);
       MachineInstrBuilder EXRL_MIB =
         BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo))
           .addImm(Opcode)
-          .addReg(LenMinus1Reg)
+          .addReg(LenAdjReg)
           .addReg(RemDestReg).addImm(DestDisp)
           .addReg(RemSrcReg).addImm(SrcDisp);
       MBB->addSuccessor(AllDoneMBB);
@@ -8107,32 +8187,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
   while (ImmLength > 0) {
     uint64_t ThisLength = std::min(ImmLength, uint64_t(256));
     // The previous iteration might have created out-of-range displacements.
-    // Apply them using LAY if so.
-    if (!isUInt<12>(DestDisp)) {
-      Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
-      BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
-          .add(DestBase)
-          .addImm(DestDisp)
-          .addReg(0);
-      DestBase = MachineOperand::CreateReg(Reg, false);
-      DestDisp = 0;
-    }
-    if (!isUInt<12>(SrcDisp)) {
-      Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
-      BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
-          .add(SrcBase)
-          .addImm(SrcDisp)
-          .addReg(0);
-      SrcBase = MachineOperand::CreateReg(Reg, false);
-      SrcDisp = 0;
-    }
-    BuildMI(*MBB, MI, DL, TII->get(Opcode))
-        .add(DestBase)
-        .addImm(DestDisp)
-        .addImm(ThisLength)
-        .add(SrcBase)
-        .addImm(SrcDisp)
-        .setMemRefs(MI.memoperands());
+    // Apply them using LA/LAY if so.
+    foldDisplIfNeeded(DestBase, DestDisp);
+    foldDisplIfNeeded(SrcBase, SrcDisp);
+    insertMemMemOp(MBB, MI, DestBase, DestDisp, SrcBase, SrcDisp, ThisLength);
     DestDisp += ThisLength;
     SrcDisp += ThisLength;
     ImmLength -= ThisLength;
@@ -8630,6 +8688,11 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
   case SystemZ::CLCImm:
   case SystemZ::CLCReg:
     return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
+  case SystemZ::MemsetImmImm:
+  case SystemZ::MemsetImmReg:
+  case SystemZ::MemsetRegImm:
+  case SystemZ::MemsetRegReg:
+    return emitMemMemWrapper(MI, MBB, SystemZ::MVC, true/*IsMemset*/);
   case SystemZ::CLSTLoop:
     return emitStringWrapper(MI, MBB, SystemZ::CLST);
   case SystemZ::MVSTLoop:

diff  --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 461f804ca55e..940c0a857ea4 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -126,6 +126,9 @@ enum NodeType : unsigned {
   // as for MVC.
   CLC,
 
+  // Use MVC to set a block of memory after storing the first byte.
+  MEMSET_MVC,
+
   // Use an MVST-based sequence to implement stpcpy().
   STPCPY,
 
@@ -709,7 +712,8 @@ class SystemZTargetLowering : public TargetLowering {
   MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI,
                                         MachineBasicBlock *BB) const;
   MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB,
-                                       unsigned Opcode) const;
+                                       unsigned Opcode,
+                                       bool IsMemset = false) const;
   MachineBasicBlock *emitStringWrapper(MachineInstr &MI, MachineBasicBlock *BB,
                                        unsigned Opcode) const;
   MachineBasicBlock *emitTransactionBegin(MachineInstr &MI,

diff  --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index cd60fff1ab11..e513befd0d6f 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5256,6 +5256,16 @@ class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2>
   let Constraints = "$R1 = $R1src";
 }
 
+class MemsetPseudo<DAGOperand lenop, DAGOperand byteop>
+  : Pseudo<(outs), (ins bdaddr12only:$dest, lenop:$length, byteop:$B),
+           [(z_memset_mvc bdaddr12only:$dest, lenop:$length, byteop:$B)]> {
+  let Defs = [CC];
+  let mayLoad = 1;
+  let mayStore = 1;
+  let usesCustomInserter = 1;
+  let hasNoSchedulingInfo = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // Multiclasses that emit both real and pseudo instructions
 //===----------------------------------------------------------------------===//

diff  --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index e4760229fd6b..84f1e0fb428c 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -510,6 +510,12 @@ let mayLoad = 1, mayStore = 1, Defs = [CC] in {
   def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>;
 }
 
+// Memset[Length][Byte] pseudos.
+def MemsetImmImm : MemsetPseudo<imm64, imm32zx8trunc>;
+def MemsetImmReg : MemsetPseudo<imm64, GR32>;
+def MemsetRegImm : MemsetPseudo<ADDR64, imm32zx8trunc>;
+def MemsetRegReg : MemsetPseudo<ADDR64, GR32>;
+
 // Move right.
 let Predicates = [FeatureMiscellaneousExtensions3],
     mayLoad = 1, mayStore = 1, Uses = [R0L] in

diff  --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index 927d97233286..9935416559bc 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -102,6 +102,10 @@ def SDT_ZMemMemLengthCC     : SDTypeProfile<1, 3,
                                              SDTCisPtrTy<1>,
                                              SDTCisPtrTy<2>,
                                              SDTCisVT<3, i64>]>;
+def SDT_ZMemsetMVC          : SDTypeProfile<0, 3,
+                                            [SDTCisPtrTy<0>,
+                                             SDTCisVT<1, i64>,
+                                             SDTCisVT<2, i32>]>;
 def SDT_ZString             : SDTypeProfile<1, 3,
                                             [SDTCisPtrTy<0>,
                                              SDTCisPtrTy<1>,
@@ -413,6 +417,8 @@ def z_xc                : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
                                   [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def z_clc               : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC,
                                  [SDNPHasChain, SDNPMayLoad]>;
+def z_memset_mvc        : SDNode<"SystemZISD::MEMSET_MVC", SDT_ZMemsetMVC,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def z_strcmp            : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC,
                                  [SDNPHasChain, SDNPMayLoad]>;
 def z_stpcpy            : SDNode<"SystemZISD::STPCPY", SDT_ZString,

diff  --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index f38e93109967..db4b4879b33a 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -17,29 +17,44 @@ using namespace llvm;
 
 #define DEBUG_TYPE "systemz-selectiondag-info"
 
-static SDVTList getMemMemVTs(unsigned Op, SelectionDAG &DAG) {
-  return Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other)
-                               : DAG.getVTList(MVT::Other);
+static unsigned getMemMemLenAdj(unsigned Op) {
+  return Op == SystemZISD::MEMSET_MVC ? 2 : 1;
 }
 
-// Emit a mem-mem operation after subtracting one from size, which will be
-// added back during pseudo expansion. As the Reg case emitted here may be
-// converted by DAGCombiner into having an Imm length, they are both emitted
-// the same way.
+static SDValue createMemMemNode(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
+                                SDValue Chain, SDValue Dst, SDValue Src,
+                                SDValue LenAdj, SDValue Byte) {
+  SDVTList VTs = Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other)
+                                       : DAG.getVTList(MVT::Other);
+  SmallVector<SDValue, 6> Ops;
+  if (Op == SystemZISD::MEMSET_MVC)
+    Ops = { Chain, Dst, LenAdj, Byte };
+  else
+    Ops = { Chain, Dst, Src, LenAdj };
+  return DAG.getNode(Op, DL, VTs, Ops);
+}
+
+// Emit a mem-mem operation after subtracting one (or two for memset) from
+// size, which will be added back during pseudo expansion. As the Reg case
+// emitted here may be converted by DAGCombiner into having an Imm length,
+// they are both emitted the same way.
 static SDValue emitMemMemImm(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
                              SDValue Chain, SDValue Dst, SDValue Src,
-                             uint64_t Size) {
-  return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src,
-                     DAG.getConstant(Size - 1, DL, Src.getValueType()));
+                             uint64_t Size, SDValue Byte = SDValue()) {
+  unsigned Adj = getMemMemLenAdj(Op);
+  assert(Size >= Adj && "Adjusted length overflow.");
+  SDValue LenAdj = DAG.getConstant(Size - Adj, DL, Dst.getValueType());
+  return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte);
 }
 
 static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
                              SDValue Chain, SDValue Dst, SDValue Src,
-                             SDValue Size) {
-  SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64,
-                                  DAG.getZExtOrTrunc(Size, DL, MVT::i64),
-                                  DAG.getConstant(-1, DL, MVT::i64));
-  return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, LenMinus1);
+                             SDValue Size, SDValue Byte = SDValue()) {
+  int64_t Adj = getMemMemLenAdj(Op);
+  SDValue LenAdj = DAG.getNode(ISD::ADD, DL, MVT::i64,
+                               DAG.getZExtOrTrunc(Size, DL, MVT::i64),
+                               DAG.getConstant(0 - Adj, DL, MVT::i64));
+  return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte);
 }
 
 SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy(
@@ -127,13 +142,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
     if (CByte && CByte->getZExtValue() == 0)
       return emitMemMemImm(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Bytes);
 
-    // Copy the byte to the first location and then use MVC to copy
-    // it to the rest.
-    Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment);
-    SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
-                                   DAG.getConstant(1, DL, PtrVT));
-    return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, DstPlus1, Dst,
-                         Bytes - 1);
+    return emitMemMemImm(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(),
+                         Bytes, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32));
   }
 
   // Variable length
@@ -141,7 +151,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
     // Handle the special case of a variable length memset of 0 with XC.
     return emitMemMemReg(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Size);
 
-  return SDValue();
+  return emitMemMemReg(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(),
+                       Size, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32));
 }
 
 // Convert the current CC value into an integer that is 0 if CC == 0,

diff  --git a/llvm/test/CodeGen/SystemZ/memset-01.ll b/llvm/test/CodeGen/SystemZ/memset-01.ll
index 73b3ffa5b4a9..c621b2ee96de 100644
--- a/llvm/test/CodeGen/SystemZ/memset-01.ll
+++ b/llvm/test/CodeGen/SystemZ/memset-01.ll
@@ -87,7 +87,8 @@ define void @f8(i8* %dest, i8 %val) {
 define void @f9(i8* %dest, i8 %val) {
 ; CHECK-LABEL: f9:
 ; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 257, i1 false)
   ret void
@@ -97,7 +98,8 @@ define void @f9(i8* %dest, i8 %val) {
 define void @f10(i8* %dest, i8 %val) {
 ; CHECK-LABEL: f10:
 ; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 257, i1 false)
   ret void
@@ -107,7 +109,8 @@ define void @f10(i8* %dest, i8 %val) {
 define void @f11(i8* %dest, i8 %val) {
 ; CHECK-LABEL: f11:
 ; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
 ; CHECK: mvc 257(1,%r2), 256(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 258, i1 false)
@@ -118,7 +121,8 @@ define void @f11(i8* %dest, i8 %val) {
 define void @f12(i8* %dest, i8 %val) {
 ; CHECK-LABEL: f12:
 ; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
 ; CHECK: mvc 257(1,%r2), 256(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 258, i1 false)
@@ -129,30 +133,88 @@ define void @f12(i8* %dest, i8 %val) {
 define void @f13(i8* %dest, i8 %val) {
 ; CHECK-LABEL: f13:
 ; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
-; CHECK: mvc 257(256,%r2), 256(%r2)
-; CHECK: mvc 513(256,%r2), 512(%r2)
-; CHECK: mvc 769(256,%r2), 768(%r2)
-; CHECK: mvc 1025(256,%r2), 1024(%r2)
-; CHECK: mvc 1281(256,%r2), 1280(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
+; CHECK: mvc 257(255,%r2), 256(%r2)
+; CHECK: stc %r3, 512(%r2)
+; CHECK: mvc 513(255,%r2), 512(%r2)
+; CHECK: stc %r3, 768(%r2)
+; CHECK: mvc 769(255,%r2), 768(%r2)
+; CHECK: stc %r3, 1024(%r2)
+; CHECK: mvc 1025(255,%r2), 1024(%r2)
+; CHECK: stc %r3, 1280(%r2)
+; CHECK: mvc 1281(255,%r2), 1280(%r2)
 ; CHECK: br %r14
-  call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1537, i1 false)
+  call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1536, i1 false)
   ret void
 }
 
 ; Test the next size up, which uses a loop.  We leave the other corner
-; cases to memcpy-01.ll.
+; cases to memcpy-01.ll and memset-07.ll.
 define void @f14(i8* %dest, i8 %val) {
 ; CHECK-LABEL: f14:
-; CHECK: stc %r3, 0(%r2)
 ; CHECK: lghi [[COUNT:%r[0-5]]], 6
 ; CHECK: [[LABEL:\.L[^:]*]]:
-; CHECK: pfd 2, 769(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: pfd 2, 768(%r2)
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
 ; CHECK: la %r2, 256(%r2)
 ; CHECK: brctg [[COUNT]], [[LABEL]]
-; CHECK: mvc 1(1,%r2), 0(%r2)
-; CHECK: br %r14
-  call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1538, i1 false)
+; CHECK: stc %r3, 0(%r2)
+; CHECK-NEXT: br %r14
+  call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1537, i1 false)
   ret void
 }
+
+; Test (no) folding of displacement: Begins with max(uint12) - 1.
+define void @f15(i8* %dest, i8 %val) {
+; CHECK-LABEL: f15:
+; CHECK-NOT: la {{.*}}%r2
+  %addr = getelementptr i8, i8* %dest, i64 4094
+  call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 256, i1 false)
+  ret void
+}
+
+; Test folding of displacement: Begins with max(uint12).
+define void @f16(i8* %dest, i8 %val) {
+; CHECK-LABEL: f16:
+; CHECK-DAG: lay %r1, 4096(%r2)
+; CHECK-DAG: stc %r3, 4095(%r2)
+  %addr = getelementptr i8, i8* %dest, i64 4095
+  call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 256, i1 false)
+  ret void
+}
+
+; Test folding of the destination displacement only (LAY): First two ops are in range.
+define void @f17(i8* %dest, i8 %val) {
+; CHECK-LABEL: f17:
+; CHECK:      stc %r3, 3583(%r2)
+; CHECK-NEXT: mvc 3584(255,%r2), 3583(%r2)
+; CHECK-NEXT: stc %r3, 3839(%r2)
+; CHECK-NEXT: mvc 3840(255,%r2), 3839(%r2)
+; CHECK-NEXT: lay %r1, 4096(%r2)
+; CHECK-NEXT: stc %r3, 4095(%r2)
+; CHECK-NEXT: mvc 0(1,%r1), 4095(%r2)
+; CHECK-NEXT: br %r14
+  %addr = getelementptr i8, i8* %dest, i64 3583
+  call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 514, i1 false)
+  ret void
+}
+
+; Test folding of displacement with LAY: First two ops are in range.
+define void @f18(i8* %dest, i8 %val) {
+; CHECK-LABEL: f18:
+; CHECK:      stc %r3, 3584(%r2)
+; CHECK-NEXT: mvc 3585(255,%r2), 3584(%r2)
+; CHECK-NEXT: stc %r3, 3840(%r2)
+; CHECK-NEXT: mvc 3841(255,%r2), 3840(%r2)
+; CHECK-NEXT: lay %r1, 4097(%r2)
+; CHECK-NEXT: lay %r2, 4096(%r2)
+; CHECK-NEXT: stc %r3, 0(%r2)
+; CHECK-NEXT: mvc 0(1,%r1), 0(%r2)
+; CHECK-NEXT: br %r14
+  %addr = getelementptr i8, i8* %dest, i64 3584
+  call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 514, i1 false)
+  ret void
+}
+

diff  --git a/llvm/test/CodeGen/SystemZ/memset-02.ll b/llvm/test/CodeGen/SystemZ/memset-02.ll
index 3f5ffca3398b..52dd6d2f48e5 100644
--- a/llvm/test/CodeGen/SystemZ/memset-02.ll
+++ b/llvm/test/CodeGen/SystemZ/memset-02.ll
@@ -123,7 +123,8 @@ define void @f12(i8* %dest) {
 define void @f13(i8* %dest) {
 ; CHECK-LABEL: f13:
 ; CHECK: mvi 0(%r2), 128
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 128
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8* %dest, i8 128, i32 257, i1 false)
   ret void
@@ -133,7 +134,8 @@ define void @f13(i8* %dest) {
 define void @f14(i8* %dest) {
 ; CHECK-LABEL: f14:
 ; CHECK: mvi 0(%r2), 128
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 128
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8* %dest, i8 128, i64 257, i1 false)
   ret void
@@ -143,7 +145,8 @@ define void @f14(i8* %dest) {
 define void @f15(i8* %dest) {
 ; CHECK-LABEL: f15:
 ; CHECK: mvi 0(%r2), 128
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 128
 ; CHECK: mvc 257(1,%r2), 256(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8* %dest, i8 128, i32 258, i1 false)
@@ -154,7 +157,8 @@ define void @f15(i8* %dest) {
 define void @f16(i8* %dest) {
 ; CHECK-LABEL: f16:
 ; CHECK: mvi 0(%r2), 128
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 128
 ; CHECK: mvc 257(1,%r2), 256(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8* %dest, i8 128, i64 258, i1 false)

diff  --git a/llvm/test/CodeGen/SystemZ/memset-04.ll b/llvm/test/CodeGen/SystemZ/memset-04.ll
index dcb8b6bad814..825d69c4b46e 100644
--- a/llvm/test/CodeGen/SystemZ/memset-04.ll
+++ b/llvm/test/CodeGen/SystemZ/memset-04.ll
@@ -359,7 +359,8 @@ define void @f36(i8* %dest) {
 define void @f37(i8* %dest) {
 ; CHECK-LABEL: f37:
 ; CHECK: mvi 0(%r2), 255
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 255
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8* %dest, i8 -1, i32 257, i1 false)
   ret void
@@ -369,7 +370,8 @@ define void @f37(i8* %dest) {
 define void @f38(i8* %dest) {
 ; CHECK-LABEL: f38:
 ; CHECK: mvi 0(%r2), 255
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 255
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8* %dest, i8 -1, i64 257, i1 false)
   ret void
@@ -379,7 +381,8 @@ define void @f38(i8* %dest) {
 define void @f39(i8* %dest) {
 ; CHECK-LABEL: f39:
 ; CHECK: mvi 0(%r2), 255
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 255
 ; CHECK: mvc 257(1,%r2), 256(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8* %dest, i8 -1, i32 258, i1 false)
@@ -390,7 +393,8 @@ define void @f39(i8* %dest) {
 define void @f40(i8* %dest) {
 ; CHECK-LABEL: f40:
 ; CHECK: mvi 0(%r2), 255
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 255
 ; CHECK: mvc 257(1,%r2), 256(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8* %dest, i8 -1, i64 258, i1 false)

diff  --git a/llvm/test/CodeGen/SystemZ/memset-07.ll b/llvm/test/CodeGen/SystemZ/memset-07.ll
new file mode 100644
index 000000000000..0d08298c8647
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/memset-07.ll
@@ -0,0 +1,100 @@
+; Test memset in cases where a loop is used.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare void @llvm.memset.p0i8.i32(i8 *nocapture, i8, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i64(i8 *nocapture, i8, i64, i1) nounwind
+
+; Constant length: 6 iterations and 2 bytes remainder.
+define void @f1(i8* %dest, i8 %val) {
+; CHECK-LABEL: f1:
+; CHECK: lghi [[COUNT:%r[0-5]]], 6
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 768(%r2)
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: la %r2, 256(%r2)
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: stc %r3, 0(%r2)
+; CHECK-NEXT: mvc 1(1,%r2), 0(%r2)
+; CHECK-NEXT: br %r14
+  call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1538, i1 false)
+  ret void
+}
+
+; Constant length: 6 iterations and 255 bytes remainder.
+define void @f2(i8* %dest) {
+; CHECK-LABEL: f2:
+; CHECK: lghi [[COUNT:%r[0-5]]], 6
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 768(%r2)
+; CHECK: mvi  0(%r2), 1
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: la %r2, 256(%r2)
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvi  0(%r2), 1
+; CHECK-NEXT: mvc 1(254,%r2), 0(%r2)
+; CHECK-NEXT: br %r14
+  call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 1791, i1 false)
+  ret void
+}
+
+; Variable length, byte in register.
+define void @f3(i8* %dest, i8 %val, i64 %Len) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: 	aghi	%r4, -2
+; CHECK-NEXT: 	cgibe	%r4, -2, 0(%r14)
+; CHECK-NEXT: .LBB2_1:
+; CHECK-NEXT:	cgije	%r4, -1, .LBB2_5
+; CHECK-NEXT:# %bb.2:
+; CHECK-NEXT:	srlg	%r0, %r4, 8
+; CHECK-NEXT:	cgije	%r0, 0, .LBB2_4
+; CHECK-NEXT:.LBB2_3:                   # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:	pfd	2, 768(%r2)
+; CHECK-NEXT:	stc	%r3, 0(%r2)
+; CHECK-NEXT:	mvc	1(255,%r2), 0(%r2)
+; CHECK-NEXT:	la	%r2, 256(%r2)
+; CHECK-NEXT:	brctg	%r0, .LBB2_3
+; CHECK-NEXT:.LBB2_4:
+; CHECK-NEXT:	stc	%r3, 0(%r2)
+; CHECK-NEXT:	exrl	%r4, .Ltmp0
+; CHECK-NEXT:	br	%r14
+; CHECK-NEXT:.LBB2_5:
+; CHECK-NEXT:	stc	%r3, 0(%r2)
+; CHECK-NEXT:	br	%r14
+  call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 %Len, i1 false)
+  ret void
+}
+
+; Variable length, immediate byte.
+define void @f4(i8* %dest, i32 %Len) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT:	llgfr	%r1, %r3
+; CHECK-NEXT:	aghi	%r1, -2
+; CHECK-NEXT:	cgibe	%r1, -2, 0(%r14)
+; CHECK-NEXT:.LBB3_1:
+; CHECK-NEXT:	cgije	%r1, -1, .LBB3_5
+; CHECK-NEXT:# %bb.2:
+; CHECK-NEXT:	srlg	%r0, %r1, 8
+; CHECK-NEXT:	cgije	%r0, 0, .LBB3_4
+; CHECK-NEXT:.LBB3_3:                   # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:	pfd	2, 768(%r2)
+; CHECK-NEXT:	mvi	0(%r2), 1
+; CHECK-NEXT:	mvc	1(255,%r2), 0(%r2)
+; CHECK-NEXT:	la	%r2, 256(%r2)
+; CHECK-NEXT:	brctg	%r0, .LBB3_3
+; CHECK-NEXT:.LBB3_4:
+; CHECK-NEXT:	mvi	0(%r2), 1
+; CHECK-NEXT:	exrl	%r1, .Ltmp0
+; CHECK-NEXT:	br	%r14
+; CHECK-NEXT:.LBB3_5:
+; CHECK-NEXT:	mvi	0(%r2), 1
+; CHECK-NEXT:	br	%r14
+  call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 %Len, i1 false)
+  ret void
+}
+
+; CHECK: .Ltmp0:
+; CHECK-NEXT:	mvc	1(1,%r2), 0(%r2)

diff  --git a/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll b/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll
index 7ceab2795a68..6b3c979651c4 100644
--- a/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll
@@ -12,7 +12,7 @@ entry:
 ; CHECK: jg memset
 define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 {
 entry:
-  tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i1 false)
+  tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i1 true)
   ret void
 }
 


        


More information about the llvm-commits mailing list