[llvm] 37a92f3 - [SystemZ] Generate XC loop for memset 0 of variable length.

Jonas Paulsson via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 6 09:07:52 PDT 2021


Author: Jonas Paulsson
Date: 2021-07-06T18:07:31+02:00
New Revision: 37a92f3b03bf165245a9d0dc4830dcc6fed7c253

URL: https://github.com/llvm/llvm-project/commit/37a92f3b03bf165245a9d0dc4830dcc6fed7c253
DIFF: https://github.com/llvm/llvm-project/commit/37a92f3b03bf165245a9d0dc4830dcc6fed7c253.diff

LOG: [SystemZ] Generate XC loop for memset 0 of variable length.

Benchmarking has shown that it is worthwhile to implement a variable length
memset of 0 with XC (exclusive or) like gcc does, instead of using a libcall.

This requires the use of the EXecute Relative Long (EXRL) instruction which
can now be done in a framework that can also be used with other target
instructions (not just XC).

Review: Ulrich Weigand

Differential Revision: https://reviews.llvm.org/D103865

Added: 
    llvm/test/CodeGen/SystemZ/memset-05.ll

Modified: 
    llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
    llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
    llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
    llvm/lib/Target/SystemZ/SystemZInstrFormats.td
    llvm/lib/Target/SystemZ/SystemZInstrInfo.td
    llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index a9d0ed030e5ae..5cded589c06a6 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -541,6 +541,30 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
     LowerPATCHPOINT(*MI, Lower);
     return;
 
+  case SystemZ::EXRL_Pseudo: {
+    unsigned TargetInsOpc = MI->getOperand(0).getImm();
+    Register LenMinus1Reg = MI->getOperand(1).getReg();
+    Register DestReg = MI->getOperand(2).getReg();
+    int64_t DestDisp = MI->getOperand(3).getImm();
+    Register SrcReg = MI->getOperand(4).getReg();
+    int64_t SrcDisp = MI->getOperand(5).getImm();
+
+    MCSymbol *DotSym = nullptr;
+    MCInst ET = MCInstBuilder(TargetInsOpc).addReg(DestReg)
+      .addImm(DestDisp).addImm(1).addReg(SrcReg).addImm(SrcDisp);
+    MCInstSTIPair ET_STI(ET, &MF->getSubtarget());
+    EXRLT2SymMap::iterator I = EXRLTargets2Sym.find(ET_STI);
+    if (I != EXRLTargets2Sym.end())
+      DotSym = I->second;
+    else
+      EXRLTargets2Sym[ET_STI] = DotSym = OutContext.createTempSymbol();
+    const MCSymbolRefExpr *Dot = MCSymbolRefExpr::create(DotSym, OutContext);
+    EmitToStreamer(
+        *OutStreamer,
+        MCInstBuilder(SystemZ::EXRL).addReg(LenMinus1Reg).addExpr(Dot));
+    return;
+  }
+
   default:
     Lower.lower(MI, LoweredMI);
     break;
@@ -698,6 +722,19 @@ void SystemZAsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
                             getSubtargetInfo());
 }
 
+void SystemZAsmPrinter::emitEXRLTargetInstructions() {
+  if (EXRLTargets2Sym.empty())
+    return;
+  // Switch to the .text section.
+  OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+  for (auto &I : EXRLTargets2Sym) {
+    OutStreamer->emitLabel(I.second);
+    const MCInstSTIPair &MCI_STI = I.first;
+    OutStreamer->emitInstruction(MCI_STI.first, *MCI_STI.second);
+  }
+  EXRLTargets2Sym.clear();
+}
+
 // Convert a SystemZ-specific constant pool modifier into the associated
 // MCSymbolRefExpr variant kind.
 static MCSymbolRefExpr::VariantKind
@@ -746,6 +783,7 @@ bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 }
 
 void SystemZAsmPrinter::emitEndOfAsmFile(Module &M) {
+  emitEXRLTargetInstructions();
   emitStackMaps(SM);
 }
 

diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
index 2d7562c7238da..11b731103c17c 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -9,10 +9,11 @@
 #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
 
-#include "SystemZTargetMachine.h"
 #include "SystemZMCInstLower.h"
+#include "SystemZTargetMachine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/StackMaps.h"
+#include "llvm/MC/MCInstBuilder.h"
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
@@ -26,6 +27,33 @@ class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter {
 private:
   StackMaps SM;
 
+  typedef std::pair<MCInst, const MCSubtargetInfo *> MCInstSTIPair;
+  struct CmpMCInst {
+    bool operator()(const MCInstSTIPair &MCI_STI_A,
+                    const MCInstSTIPair &MCI_STI_B) const {
+      if (MCI_STI_A.second != MCI_STI_B.second)
+        return uintptr_t(MCI_STI_A.second) < uintptr_t(MCI_STI_B.second);
+      const MCInst &A = MCI_STI_A.first;
+      const MCInst &B = MCI_STI_B.first;
+      assert(A.getNumOperands() == B.getNumOperands() &&
+             A.getNumOperands() == 5 && A.getOperand(2).getImm() == 1 &&
+             B.getOperand(2).getImm() == 1 && "Unexpected EXRL target MCInst");
+      if (A.getOpcode() != B.getOpcode())
+        return A.getOpcode() < B.getOpcode();
+      if (A.getOperand(0).getReg() != B.getOperand(0).getReg())
+        return A.getOperand(0).getReg() < B.getOperand(0).getReg();
+      if (A.getOperand(1).getImm() != B.getOperand(1).getImm())
+        return A.getOperand(1).getImm() < B.getOperand(1).getImm();
+      if (A.getOperand(3).getReg() != B.getOperand(3).getReg())
+        return A.getOperand(3).getReg() < B.getOperand(3).getReg();
+      if (A.getOperand(4).getImm() != B.getOperand(4).getImm())
+        return A.getOperand(4).getImm() < B.getOperand(4).getImm();
+      return false;
+    }
+  };
+  typedef std::map<MCInstSTIPair, MCSymbol *, CmpMCInst> EXRLT2SymMap;
+  EXRLT2SymMap EXRLTargets2Sym;
+
 public:
   SystemZAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
       : AsmPrinter(TM, std::move(Streamer)), SM(*this) {}
@@ -49,6 +77,7 @@ class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter {
   void LowerFENTRY_CALL(const MachineInstr &MI, SystemZMCInstLower &MCIL);
   void LowerSTACKMAP(const MachineInstr &MI);
   void LowerPATCHPOINT(const MachineInstr &MI, SystemZMCInstLower &Lower);
+  void emitEXRLTargetInstructions();
 };
 } // end namespace llvm
 

diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 6d5a469c2e720..6b6fdcaf5630b 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -7795,43 +7795,89 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
   uint64_t DestDisp = MI.getOperand(1).getImm();
   MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
   uint64_t SrcDisp = MI.getOperand(3).getImm();
-  uint64_t Length = MI.getOperand(4).getImm();
+  MachineOperand &LengthMO = MI.getOperand(4);
+  uint64_t ImmLength = LengthMO.isImm() ? LengthMO.getImm() : 0;
+  Register LenMinus1Reg =
+      LengthMO.isReg() ? LengthMO.getReg() : SystemZ::NoRegister;
 
   // When generating more than one CLC, all but the last will need to
   // branch to the end when a difference is found.
-  MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
-                               SystemZ::splitBlockAfter(MI, MBB) : nullptr);
+  MachineBasicBlock *EndMBB = (ImmLength > 256 && Opcode == SystemZ::CLC
+                                   ? SystemZ::splitBlockAfter(MI, MBB)
+                                   : nullptr);
 
   // Check for the loop form, in which operand 5 is the trip count.
   if (MI.getNumExplicitOperands() > 5) {
-    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
-
     Register StartCountReg = MI.getOperand(5).getReg();
-    Register StartSrcReg   = forceReg(MI, SrcBase, TII);
-    Register StartDestReg  = (HaveSingleBase ? StartSrcReg :
-                              forceReg(MI, DestBase, TII));
+
+    MachineBasicBlock *StartMBB = nullptr;
+    MachineBasicBlock *LoopMBB = nullptr;
+    MachineBasicBlock *NextMBB = nullptr;
+    MachineBasicBlock *DoneMBB = nullptr;
+    MachineBasicBlock *AllDoneMBB = nullptr;
+
+    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
+    Register StartSrcReg = forceReg(MI, SrcBase, TII);
+    Register StartDestReg =
+        (HaveSingleBase ? StartSrcReg : forceReg(MI, DestBase, TII));
 
     const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
     Register ThisSrcReg  = MRI.createVirtualRegister(RC);
-    Register ThisDestReg = (HaveSingleBase ? ThisSrcReg :
-                            MRI.createVirtualRegister(RC));
+    Register ThisDestReg =
+        (HaveSingleBase ? ThisSrcReg : MRI.createVirtualRegister(RC));
     Register NextSrcReg  = MRI.createVirtualRegister(RC);
-    Register NextDestReg = (HaveSingleBase ? NextSrcReg :
-                            MRI.createVirtualRegister(RC));
-
+    Register NextDestReg =
+        (HaveSingleBase ? NextSrcReg : MRI.createVirtualRegister(RC));
     RC = &SystemZ::GR64BitRegClass;
     Register ThisCountReg = MRI.createVirtualRegister(RC);
     Register NextCountReg = MRI.createVirtualRegister(RC);
 
-    MachineBasicBlock *StartMBB = MBB;
-    MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
-    MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB);
-    MachineBasicBlock *NextMBB =
-        (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB);
-
-    //  StartMBB:
-    //   # fall through to LoopMMB
-    MBB->addSuccessor(LoopMBB);
+    if (LengthMO.isReg()) {
+      AllDoneMBB = SystemZ::splitBlockBefore(MI, MBB);
+      StartMBB = SystemZ::emitBlockAfter(MBB);
+      LoopMBB = SystemZ::emitBlockAfter(StartMBB);
+      NextMBB = LoopMBB;
+      DoneMBB = SystemZ::emitBlockAfter(LoopMBB);
+
+      //  MBB:
+      //   # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall thru to StartMBB.
+      BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+        .addReg(LenMinus1Reg).addImm(-1);
+      BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+        .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
+        .addMBB(AllDoneMBB);
+      MBB->addSuccessor(AllDoneMBB);
+      MBB->addSuccessor(StartMBB);
+
+      // StartMBB:
+      // # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB.
+      MBB = StartMBB;
+      BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+        .addReg(StartCountReg).addImm(0);
+      BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+        .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
+        .addMBB(DoneMBB);
+      MBB->addSuccessor(DoneMBB);
+      MBB->addSuccessor(LoopMBB);
+    }
+    else {
+      StartMBB = MBB;
+      DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
+      LoopMBB = SystemZ::emitBlockAfter(StartMBB);
+      NextMBB = (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB);
+
+      //  StartMBB:
+      //   # fall through to LoopMBB
+      MBB->addSuccessor(LoopMBB);
+
+      DestBase = MachineOperand::CreateReg(NextDestReg, false);
+      SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
+      ImmLength &= 255;
+      if (EndMBB && !ImmLength)
+        // If the loop handled the whole CLC range, DoneMBB will be empty with
+        // CC live-through into EndMBB, so add it as live-in.
+        DoneMBB->addLiveIn(SystemZ::CC);
+    }
 
     //  LoopMBB:
     //   %ThisDestReg = phi [ %StartDestReg, StartMBB ],
@@ -7846,7 +7892,6 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
     //
     // The prefetch is used only for MVC.  The JLH is used only for CLC.
     MBB = LoopMBB;
-
     BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
       .addReg(StartDestReg).addMBB(StartMBB)
       .addReg(NextDestReg).addMBB(NextMBB);
@@ -7882,7 +7927,6 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
     //
     // The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
     MBB = NextMBB;
-
     BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
       .addReg(ThisDestReg).addImm(256).addReg(0);
     if (!HaveSingleBase)
@@ -7898,18 +7942,39 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
     MBB->addSuccessor(LoopMBB);
     MBB->addSuccessor(DoneMBB);
 
-    DestBase = MachineOperand::CreateReg(NextDestReg, false);
-    SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
-    Length &= 255;
-    if (EndMBB && !Length)
-      // If the loop handled the whole CLC range, DoneMBB will be empty with
-      // CC live-through into EndMBB, so add it as live-in.
-      DoneMBB->addLiveIn(SystemZ::CC);
     MBB = DoneMBB;
+    if (LengthMO.isReg()) {
+      // DoneMBB:
+      // # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run.
+      // # Use EXecute Relative Long for the remainder of the bytes. The target
+      //   instruction of the EXRL will have a length field of 1 since 0 is an
+      //   illegal value. The number of bytes processed becomes (%LenMinus1Reg &
+      //   0xff) + 1.
+      // # Fall through to AllDoneMBB.
+      Register RemSrcReg  = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+      Register RemDestReg = HaveSingleBase ? RemSrcReg
+        : MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+      BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemDestReg)
+        .addReg(StartDestReg).addMBB(StartMBB)
+        .addReg(NextDestReg).addMBB(LoopMBB);
+      if (!HaveSingleBase)
+        BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg)
+          .addReg(StartSrcReg).addMBB(StartMBB)
+          .addReg(NextSrcReg).addMBB(LoopMBB);
+      MRI.constrainRegClass(LenMinus1Reg, &SystemZ::ADDR64BitRegClass);
+      BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo))
+        .addImm(Opcode)
+        .addReg(LenMinus1Reg)
+        .addReg(RemDestReg).addImm(DestDisp)
+        .addReg(RemSrcReg).addImm(SrcDisp);
+      MBB->addSuccessor(AllDoneMBB);
+      MBB = AllDoneMBB;
+    }
   }
+
   // Handle any remaining bytes with straight-line code.
-  while (Length > 0) {
-    uint64_t ThisLength = std::min(Length, uint64_t(256));
+  while (ImmLength > 0) {
+    uint64_t ThisLength = std::min(ImmLength, uint64_t(256));
     // The previous iteration might have created out-of-range displacements.
     // Apply them using LAY if so.
     if (!isUInt<12>(DestDisp)) {
@@ -7939,10 +8004,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
         .setMemRefs(MI.memoperands());
     DestDisp += ThisLength;
     SrcDisp += ThisLength;
-    Length -= ThisLength;
+    ImmLength -= ThisLength;
     // If there's another CLC to go, branch to the end if a difference
     // was found.
-    if (EndMBB && Length > 0) {
+    if (EndMBB && ImmLength > 0) {
       MachineBasicBlock *NextMBB = SystemZ::splitBlockBefore(MI, MBB);
       BuildMI(MBB, DL, TII->get(SystemZ::BRC))
         .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
@@ -8433,6 +8498,7 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
     return emitMemMemWrapper(MI, MBB, SystemZ::OC);
   case SystemZ::XCSequence:
   case SystemZ::XCLoop:
+  case SystemZ::XCLoopVarLen:
     return emitMemMemWrapper(MI, MBB, SystemZ::XC);
   case SystemZ::CLCSequence:
   case SystemZ::CLCLoop:

diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index 2c07509085040..bc2fe8e4c4c39 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5253,6 +5253,7 @@ multiclass CondUnaryRSYPseudoAndMemFold<string mnemonic,
 // The Sequence form uses a straight-line sequence of instructions and
 // the Loop form uses a loop of length-256 instructions followed by
 // another instruction to handle the excess.
+// The LoopVarLen form is for a loop with a non-constant length parameter.
 multiclass MemorySS<string mnemonic, bits<8> opcode,
                     SDPatternOperator sequence, SDPatternOperator loop> {
   def "" : SideEffectBinarySSa<mnemonic, opcode>;
@@ -5265,6 +5266,10 @@ multiclass MemorySS<string mnemonic, bits<8> opcode,
                                    imm64:$length, GR64:$count256),
                       [(loop bdaddr12only:$dest, bdaddr12only:$src,
                              imm64:$length, GR64:$count256)]>;
+    def LoopVarLen : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                         GR64:$length, GR64:$count256),
+                            [(loop bdaddr12only:$dest, bdaddr12only:$src,
+                                   GR64:$length, GR64:$count256)]>;
   }
 }
 

diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index 76d0381649c6a..ca095074001bb 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -2165,8 +2165,12 @@ let Predicates = [FeatureDeflateConversion],
 
 // Execute.
 let hasSideEffects = 1 in {
-  def EX   : SideEffectBinaryRX<"ex", 0x44, GR64>;
-  def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, GR64>;
+  def EX   : SideEffectBinaryRX<"ex", 0x44, ADDR64>;
+  def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, ADDR64>;
+  let hasNoSchedulingInfo = 1 in
+    def EXRL_Pseudo : Pseudo<(outs), (ins i64imm:$TargetOpc, ADDR64:$lenMinus1,
+                                          bdaddr12only:$bdl1, bdaddr12only:$bd2),
+                                          []>;
 }
 
 //===----------------------------------------------------------------------===//

diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index ca5ca7257bab2..4a9ea69d101c2 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -81,11 +81,12 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
   if (IsVolatile)
     return SDValue();
 
+  auto *CByte = dyn_cast<ConstantSDNode>(Byte);
   if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
     uint64_t Bytes = CSize->getZExtValue();
     if (Bytes == 0)
       return SDValue();
-    if (auto *CByte = dyn_cast<ConstantSDNode>(Byte)) {
+    if (CByte) {
       // Handle cases that can be done using at most two of
       // MVI, MVHI, MVHHI and MVGHI.  The latter two can only be
       // used if ByteVal is all zeros or all ones; in other casees,
@@ -125,7 +126,6 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
     assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already");
 
     // Handle the special case of a memset of 0, which can use XC.
-    auto *CByte = dyn_cast<ConstantSDNode>(Byte);
     if (CByte && CByte->getZExtValue() == 0)
       return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP,
                         Chain, Dst, Dst, Bytes);
@@ -138,6 +138,18 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
     return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
                       Chain, DstPlus1, Dst, Bytes - 1);
   }
+
+  // Variable length
+  if (CByte && CByte->getZExtValue() == 0) {
+    // Handle the special case of a variable length memset of 0 with XC.
+    SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64,
+                                    DAG.getZExtOrTrunc(Size, DL, MVT::i64),
+                                    DAG.getConstant(-1, DL, MVT::i64));
+    SDValue TripC = DAG.getNode(ISD::SRL, DL, MVT::i64, LenMinus1,
+                                DAG.getConstant(8, DL, MVT::i64));
+    return DAG.getNode(SystemZISD::XC_LOOP, DL, MVT::Other, Chain, Dst, Dst,
+                       LenMinus1, TripC);
+  }
   return SDValue();
 }
 

diff --git a/llvm/test/CodeGen/SystemZ/memset-05.ll b/llvm/test/CodeGen/SystemZ/memset-05.ll
new file mode 100644
index 0000000000000..780d107d7e594
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/memset-05.ll
@@ -0,0 +1,101 @@
+; Test memset 0 with variable length
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+define void @fun0(i8* %Addr, i64 %Len) {
+; CHECK-LABEL: fun0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    aghi %r3, -1
+; CHECK-NEXT:    cgibe %r3, -1, 0(%r14)
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    srlg %r0, %r3, 8
+; CHECK-NEXT:    cgije %r0, 0, .LBB0_3
+; CHECK-NEXT:  .LBB0_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    xc 0(256,%r2), 0(%r2)
+; CHECK-NEXT:    la %r2, 256(%r2)
+; CHECK-NEXT:    brctg %r0, .LBB0_2
+; CHECK-NEXT:  .LBB0_3:
+; CHECK-NEXT:    exrl %r3, .Ltmp0
+; CHECK-NEXT:    br %r14
+  tail call void @llvm.memset.p0i8.i64(i8* %Addr, i8 0, i64 %Len, i1 false)
+  ret void
+}
+
+define void @fun1(i8* %Addr, i32 %Len) {
+; CHECK-LABEL: fun1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llgfr %r1, %r3
+; CHECK-NEXT:    aghi %r1, -1
+; CHECK-NEXT:    cgibe %r1, -1, 0(%r14)
+; CHECK-NEXT:  .LBB1_1:
+; CHECK-NEXT:    srlg %r0, %r1, 8
+; CHECK-NEXT:    cgije %r0, 0, .LBB1_3
+; CHECK-NEXT:  .LBB1_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    xc 0(256,%r2), 0(%r2)
+; CHECK-NEXT:    la %r2, 256(%r2)
+; CHECK-NEXT:    brctg %r0, .LBB1_2
+; CHECK-NEXT:  .LBB1_3:
+; CHECK-NEXT:    exrl %r1, .Ltmp0
+; CHECK-NEXT:    br %r14
+  tail call void @llvm.memset.p0i8.i32(i8* %Addr, i8 0, i32 %Len, i1 false)
+  ret void
+}
+
+; Test that identical target instructions get reused.
+define void @fun2(i8* %Addr, i32 %Len) {
+; CHECK-LABEL: fun2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llgfr %r1, %r3
+; CHECK-NEXT:    aghi %r1, -1
+; CHECK-NEXT:    srlg %r0, %r1, 8
+; CHECK-NEXT:    cgije %r1, -1, .LBB2_5
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    lgr %r3, %r2
+; CHECK-NEXT:    cgije %r0, 0, .LBB2_4
+; CHECK-NEXT:  # %bb.2:
+; CHECK-NEXT:    lgr %r3, %r2
+; CHECK-NEXT:    lgr %r4, %r0
+; CHECK-NEXT:  .LBB2_3: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    xc 0(256,%r3), 0(%r3)
+; CHECK-NEXT:    la %r3, 256(%r3)
+; CHECK-NEXT:    brctg %r4, .LBB2_3
+; CHECK-NEXT:  .LBB2_4:
+; CHECK-NEXT:    exrl %r1, .Ltmp1
+; CHECK-NEXT:  .LBB2_5:
+; CHECK-NEXT:    cgije %r1, -1, .LBB2_10
+; CHECK-NEXT:  # %bb.6:
+; CHECK-NEXT:    lgr %r3, %r2
+; CHECK-NEXT:    cgije %r0, 0, .LBB2_9
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    lgr %r3, %r2
+; CHECK-NEXT:    lgr %r4, %r0
+; CHECK-NEXT:  .LBB2_8: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    xc 0(256,%r3), 0(%r3)
+; CHECK-NEXT:    la %r3, 256(%r3)
+; CHECK-NEXT:    brctg %r4, .LBB2_8
+; CHECK-NEXT:  .LBB2_9:
+; CHECK-NEXT:    exrl %r1, .Ltmp1
+; CHECK-NEXT:  .LBB2_10:
+; CHECK-NEXT:    cgibe %r1, -1, 0(%r14)
+; CHECK-NEXT:  .LBB2_11:
+; CHECK-NEXT:    cgije %r0, 0, .LBB2_13
+; CHECK-NEXT:  .LBB2_12: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    xc 0(256,%r2), 0(%r2)
+; CHECK-NEXT:    la %r2, 256(%r2)
+; CHECK-NEXT:    brctg %r0, .LBB2_12
+; CHECK-NEXT:  .LBB2_13:
+; CHECK-NEXT:    exrl %r1, .Ltmp0
+; CHECK-NEXT:    br %r14
+  tail call void @llvm.memset.p0i8.i32(i8* %Addr, i8 0, i32 %Len, i1 false)
+  tail call void @llvm.memset.p0i8.i32(i8* %Addr, i8 0, i32 %Len, i1 false)
+  tail call void @llvm.memset.p0i8.i32(i8* %Addr, i8 0, i32 %Len, i1 false)
+  ret void
+}
+
+; CHECK:       .Ltmp0:
+; CHECK-NEXT:    xc 0(1,%r2), 0(%r2)
+; CHECK-NEXT:  .Ltmp1:
+; CHECK-NEXT:    xc 0(1,%r3), 0(%r3)
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)
+declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)


        


More information about the llvm-commits mailing list