[llvm] 00baad3 - [SystemZ] Bugfix and refactorization of mem-mem operations

Thu Oct 14 01:38:08 PDT 2021

Author: Jonas Paulsson
Date: 2021-10-14T10:37:33+02:00
New Revision: 00baad35b2a366437e210a0701173247a3ad5468

URL: https://github.com/llvm/llvm-project/commit/00baad35b2a366437e210a0701173247a3ad5468
DIFF: https://github.com/llvm/llvm-project/commit/00baad35b2a366437e210a0701173247a3ad5468.diff

LOG: [SystemZ] Bugfix and refactorization of mem-mem operations

This patch fixes the bug that consisted of treating variable / immediate
length mem operations (such as memcpy, memset, ...) differently. The variable
length case needs to have the length minus 1 passed due to the use of EXRL
target instructions. However, the DAGCombiner can convert a register length
argument into a constant one, and whenever that happened one byte too little
would end up being performed.

This is also a refactorization by reducing the number of opcodes and variants
involved. For any opcode (variable or constant length), only the length minus
one is passed on to the ISD node. The rest of the logic is now instead
handled during isel pseudo expansion.

Review: Ulrich Weigand

Differential Revision: https://reviews.llvm.org/D111729

Added: 
    

Modified: 
    llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
    llvm/lib/Target/SystemZ/SystemZInstrFP.td
    llvm/lib/Target/SystemZ/SystemZInstrFormats.td
    llvm/lib/Target/SystemZ/SystemZInstrInfo.td
    llvm/lib/Target/SystemZ/SystemZOperators.td
    llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
    llvm/test/CodeGen/SystemZ/memset-05.ll
    llvm/test/CodeGen/SystemZ/mverify-optypes.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index e65dfea15f0c6..564dd78f4548b 100644

--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -7830,9 +7830,40 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
   MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
   uint64_t SrcDisp = MI.getOperand(3).getImm();
   MachineOperand &LengthMO = MI.getOperand(4);
-  uint64_t ImmLength = LengthMO.isImm() ? LengthMO.getImm() : 0;
-  Register LenMinus1Reg =
-      LengthMO.isReg() ? LengthMO.getReg() : SystemZ::NoRegister;
+  bool IsImmForm = LengthMO.isImm();
+  bool IsRegForm = !IsImmForm;
+
+  bool NeedsLoop = false;
+  uint64_t ImmLength = 0;
+  Register LenMinus1Reg = SystemZ::NoRegister;
+  if (IsImmForm) {
+    ImmLength = LengthMO.getImm();
+    ImmLength++; // Add back the '1' subtracted originally.
+    if (ImmLength == 0) {
+      MI.eraseFromParent();
+      return MBB;
+    }
+    if (Opcode == SystemZ::CLC) {
+      if (ImmLength > 3 * 256)
+        // A two-CLC sequence is a clear win over a loop, not least because
+        // it needs only one branch.  A three-CLC sequence needs the same
+        // number of branches as a loop (i.e. 2), but is shorter.  That
+        // brings us to lengths greater than 768 bytes.  It seems relatively
+        // likely that a 
diff erence will be found within the first 768 bytes,
+        // so we just optimize for the smallest number of branch
+        // instructions, in order to avoid polluting the prediction buffer
+        // too much.
+        NeedsLoop = true;
+    } else if (ImmLength > 6 * 256)
+      // The heuristic we use is to prefer loops for anything that would
+      // require 7 or more MVCs.  With these kinds of sizes there isn't much
+      // to choose between straight-line code and looping code, since the
+      // time will be dominated by the MVCs themselves.
+      NeedsLoop = true;
+  } else {
+    NeedsLoop = true;
+    LenMinus1Reg = LengthMO.getReg();
+  }
 
   // When generating more than one CLC, all but the last will need to
   // branch to the end when a 
diff erence is found.
@@ -7840,16 +7871,25 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
                                    ? SystemZ::splitBlockAfter(MI, MBB)
                                    : nullptr);
 
-  // Check for the loop form, in which operand 5 is the trip count.
-  if (MI.getNumExplicitOperands() > 5) {
-    Register StartCountReg = MI.getOperand(5).getReg();
-    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
+  if (NeedsLoop) {
+    Register StartCountReg =
+      MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+    if (IsImmForm) {
+      TII->loadImmediate(*MBB, MI, StartCountReg, ImmLength / 256);
+      ImmLength &= 255;
+    } else {
+      BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg)
+        .addReg(LenMinus1Reg)
+        .addReg(0)
+        .addImm(8);
+    }
 
     auto loadZeroAddress = [&]() -> MachineOperand {
       Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
       BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0);
       return MachineOperand::CreateReg(Reg, false);
     };
+    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
     if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister)
       DestBase = loadZeroAddress();
     if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister)
@@ -7876,7 +7916,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
     Register ThisCountReg = MRI.createVirtualRegister(RC);
     Register NextCountReg = MRI.createVirtualRegister(RC);
 
-    if (LengthMO.isReg()) {
+    if (IsRegForm) {
       AllDoneMBB = SystemZ::splitBlockBefore(MI, MBB);
       StartMBB = SystemZ::emitBlockAfter(MBB);
       LoopMBB = SystemZ::emitBlockAfter(StartMBB);
@@ -7916,7 +7956,6 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
 
       DestBase = MachineOperand::CreateReg(NextDestReg, false);
       SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
-      ImmLength &= 255;
       if (EndMBB && !ImmLength)
         // If the loop handled the whole CLC range, DoneMBB will be empty with
         // CC live-through into EndMBB, so add it as live-in.
@@ -7987,7 +8026,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
     MBB->addSuccessor(DoneMBB);
 
     MBB = DoneMBB;
-    if (LengthMO.isReg()) {
+    if (IsRegForm) {
       // DoneMBB:
       // # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run.
       // # Use EXecute Relative Long for the remainder of the bytes. The target
@@ -8005,7 +8044,6 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
         BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg)
           .addReg(StartSrcReg).addMBB(StartMBB)
           .addReg(NextSrcReg).addMBB(LoopMBB);
-      MRI.constrainRegClass(LenMinus1Reg, &SystemZ::ADDR64BitRegClass);
       BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo))
         .addImm(Opcode)
         .addReg(LenMinus1Reg)
@@ -8530,21 +8568,16 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
 
   case SystemZ::ATOMIC_CMP_SWAPW:
     return emitAtomicCmpSwapW(MI, MBB);
-  case SystemZ::MVCSequence:
-  case SystemZ::MVCLoop:
+  case SystemZ::MVCImm:
     return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
-  case SystemZ::NCSequence:
-  case SystemZ::NCLoop:
+  case SystemZ::NCImm:
     return emitMemMemWrapper(MI, MBB, SystemZ::NC);
-  case SystemZ::OCSequence:
-  case SystemZ::OCLoop:
+  case SystemZ::OCImm:
     return emitMemMemWrapper(MI, MBB, SystemZ::OC);
-  case SystemZ::XCSequence:
-  case SystemZ::XCLoop:
-  case SystemZ::XCLoopVarLen:
+  case SystemZ::XCImm:
+  case SystemZ::XCReg:
     return emitMemMemWrapper(MI, MBB, SystemZ::XC);
-  case SystemZ::CLCSequence:
-  case SystemZ::CLCLoop:
+  case SystemZ::CLCImm:
     return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
   case SystemZ::CLSTLoop:
     return emitStringWrapper(MI, MBB, SystemZ::CLST);

diff  --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 337164d55e5fd..7cbe125533d30 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -128,9 +128,10 @@ let Predicates = [FeatureNoVectorEnhancements1] in {
                                     (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
 }
 
-defm LoadStoreF32  : MVCLoadStore<load, f32,  MVCSequence, 4>;
-defm LoadStoreF64  : MVCLoadStore<load, f64,  MVCSequence, 8>;
-defm LoadStoreF128 : MVCLoadStore<load, f128, MVCSequence, 16>;
+// The length is given as one less for MVCImm.
+defm LoadStoreF32  : MVCLoadStore<load, f32,  MVCImm, 3>;
+defm LoadStoreF64  : MVCLoadStore<load, f64,  MVCImm, 7>;
+defm LoadStoreF128 : MVCLoadStore<load, f128, MVCImm, 15>;
 
 //===----------------------------------------------------------------------===//
 // Load instructions

diff  --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index 5cb46cdb36a60..76819dc85413d 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5329,42 +5329,33 @@ multiclass CondUnaryRSYPseudoAndMemFold<string mnemonic,
 
 // Define an instruction that operates on two fixed-length blocks of memory,
 // and associated pseudo instructions for operating on blocks of any size.
-// The Sequence form uses a straight-line sequence of instructions and
-// the Loop form uses a loop of length-256 instructions followed by
-// another instruction to handle the excess.
-// The LoopVarLen form is for a loop with a non-constant length parameter.
-multiclass MemorySS<string mnemonic, bits<8> opcode,
-                    SDPatternOperator sequence, SDPatternOperator loop> {
+// There are two pseudos for the 
diff erent cases of when the length is
+// constant or variable. The length operand of a pseudo is actually one less
+// than the intended number of bytes, since the register case needs to use an
+// EXRL with a target instruction that adds one to the length always.
+multiclass MemorySS<string mnemonic, bits<8> opcode, SDPatternOperator memop> {
   def "" : SideEffectBinarySSa<mnemonic, opcode>;
   let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [CC] in {
-    def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
-                                       imm64:$length),
-                           [(sequence bdaddr12only:$dest, bdaddr12only:$src,
-                                      imm64:$length)]>;
-    def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
-                                   imm64:$length, GR64:$count256),
-                      [(loop bdaddr12only:$dest, bdaddr12only:$src,
-                             imm64:$length, GR64:$count256)]>;
-    def LoopVarLen : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
-                                         GR64:$length, GR64:$count256),
-                            [(loop bdaddr12only:$dest, bdaddr12only:$src,
-                                   GR64:$length, GR64:$count256)]>;
+    def Imm : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                  imm64:$length),
+                             [(memop bdaddr12only:$dest, bdaddr12only:$src,
+                                     imm64:$length)]>;
+    def Reg : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                  ADDR64:$length),
+                             [(memop bdaddr12only:$dest, bdaddr12only:$src,
+                                     ADDR64:$length)]>;
   }
 }
 
 // The same, but setting a CC result as comparison operator.
 multiclass CompareMemorySS<string mnemonic, bits<8> opcode,
-                          SDPatternOperator sequence, SDPatternOperator loop> {
+                           SDPatternOperator memop> {
   def "" : SideEffectBinarySSa<mnemonic, opcode>;
   let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
-    def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
-                                       imm64:$length),
-                           [(set CC, (sequence bdaddr12only:$dest, bdaddr12only:$src,
-                                               imm64:$length))]>;
-    def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
-                                   imm64:$length, GR64:$count256),
-                      [(set CC, (loop bdaddr12only:$dest, bdaddr12only:$src,
-                                      imm64:$length, GR64:$count256))]>;
+    def Imm : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                  imm64:$length),
+                          [(set CC, (memop bdaddr12only:$dest, bdaddr12only:$src,
+                                           imm64:$length))]>;
   }
 }
 

diff  --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index 7df7cc93d6eb1..41d84c8c49ecc 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -503,7 +503,7 @@ def MVGHI : StoreSIL<"mvghi", 0xE548, store,         imm64sx16>;
 
 // Memory-to-memory moves.
 let mayLoad = 1, mayStore = 1 in
-  defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
+  defm MVC : MemorySS<"mvc", 0xD2, z_mvc>;
 let mayLoad = 1, mayStore = 1, Defs = [CC] in {
   def MVCL  : SideEffectBinaryMemMemRR<"mvcl", 0x0E, GR128, GR128>;
   def MVCLE : SideEffectTernaryMemMemRS<"mvcle", 0xA8, GR128, GR128>;
@@ -1200,7 +1200,7 @@ let Defs = [CC] in {
 
   // Block AND.
   let mayLoad = 1, mayStore = 1 in
-    defm NC : MemorySS<"nc", 0xD4, z_nc, z_nc_loop>;
+    defm NC : MemorySS<"nc", 0xD4, z_nc>;
 }
 defm : RMWIByte<and, bdaddr12pair, NI>;
 defm : RMWIByte<and, bdaddr20pair, NIY>;
@@ -1257,7 +1257,7 @@ let Defs = [CC] in {
 
   // Block OR.
   let mayLoad = 1, mayStore = 1 in
-    defm OC : MemorySS<"oc", 0xD6, z_oc, z_oc_loop>;
+    defm OC : MemorySS<"oc", 0xD6, z_oc>;
 }
 defm : RMWIByte<or, bdaddr12pair, OI>;
 defm : RMWIByte<or, bdaddr20pair, OIY>;
@@ -1297,7 +1297,7 @@ let Defs = [CC] in {
 
   // Block XOR.
   let mayLoad = 1, mayStore = 1 in
-    defm XC : MemorySS<"xc", 0xD7, z_xc, z_xc_loop>;
+    defm XC : MemorySS<"xc", 0xD7, z_xc>;
 }
 defm : RMWIByte<xor, bdaddr12pair, XI>;
 defm : RMWIByte<xor, bdaddr20pair, XIY>;
@@ -1624,7 +1624,7 @@ defm : ZXB<z_ucmp, GR64, CLGFR>;
 
 // Memory-to-memory comparison.
 let mayLoad = 1, Defs = [CC] in {
-  defm CLC : CompareMemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+  defm CLC : CompareMemorySS<"clc", 0xD5, z_clc>;
   def CLCL  : SideEffectBinaryMemMemRR<"clcl", 0x0F, GR128, GR128>;
   def CLCLE : SideEffectTernaryMemMemRS<"clcle", 0xA9, GR128, GR128>;
   def CLCLU : SideEffectTernaryMemMemRSY<"clclu", 0xEB8F, GR128, GR128>;
@@ -2355,21 +2355,15 @@ let AddedComplexity = 4 in {
             (RLLG GR64:$val, (LCR GR32:$shift), 0)>;
 }
 
-// Peepholes for turning scalar operations into block operations.
-defm : BlockLoadStore<anyextloadi8, i32, MVCSequence, NCSequence, OCSequence,
-                      XCSequence, 1>;
-defm : BlockLoadStore<anyextloadi16, i32, MVCSequence, NCSequence, OCSequence,
-                      XCSequence, 2>;
-defm : BlockLoadStore<load, i32, MVCSequence, NCSequence, OCSequence,
-                      XCSequence, 4>;
-defm : BlockLoadStore<anyextloadi8, i64, MVCSequence, NCSequence,
-                      OCSequence, XCSequence, 1>;
-defm : BlockLoadStore<anyextloadi16, i64, MVCSequence, NCSequence, OCSequence,
-                      XCSequence, 2>;
-defm : BlockLoadStore<anyextloadi32, i64, MVCSequence, NCSequence, OCSequence,
-                      XCSequence, 4>;
-defm : BlockLoadStore<load, i64, MVCSequence, NCSequence, OCSequence,
-                      XCSequence, 8>;
+// Peepholes for turning scalar operations into block operations.  The length
+// is given as one less for these pseudos.
+defm : BlockLoadStore<anyextloadi8, i32, MVCImm, NCImm, OCImm, XCImm, 0>;
+defm : BlockLoadStore<anyextloadi16, i32, MVCImm, NCImm, OCImm, XCImm, 1>;
+defm : BlockLoadStore<load, i32, MVCImm, NCImm, OCImm, XCImm, 3>;
+defm : BlockLoadStore<anyextloadi8, i64, MVCImm, NCImm, OCImm, XCImm, 0>;
+defm : BlockLoadStore<anyextloadi16, i64, MVCImm, NCImm, OCImm, XCImm, 1>;
+defm : BlockLoadStore<anyextloadi32, i64, MVCImm, NCImm, OCImm, XCImm, 3>;
+defm : BlockLoadStore<load, i64, MVCImm, NCImm, OCImm, XCImm, 7>;
 
 //===----------------------------------------------------------------------===//
 // Mnemonic Aliases

diff  --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index 992b1512a0779..927d972332864 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -102,17 +102,6 @@ def SDT_ZMemMemLengthCC     : SDTypeProfile<1, 3,
                                              SDTCisPtrTy<1>,
                                              SDTCisPtrTy<2>,
                                              SDTCisVT<3, i64>]>;
-def SDT_ZMemMemLoop         : SDTypeProfile<0, 4,
-                                            [SDTCisPtrTy<0>,
-                                             SDTCisPtrTy<1>,
-                                             SDTCisVT<2, i64>,
-                                             SDTCisVT<3, i64>]>;
-def SDT_ZMemMemLoopCC       : SDTypeProfile<1, 4,
-                                            [SDTCisVT<0, i32>,
-                                             SDTCisPtrTy<1>,
-                                             SDTCisPtrTy<2>,
-                                             SDTCisVT<3, i64>,
-                                             SDTCisVT<4, i64>]>;
 def SDT_ZString             : SDTypeProfile<1, 3,
                                             [SDTCisPtrTy<0>,
                                              SDTCisPtrTy<1>,
@@ -416,24 +405,14 @@ def z_atomic_cmp_swap_128 : SDNode<"SystemZISD::ATOMIC_CMP_SWAP_128",
 
 def z_mvc               : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_mvc_loop          : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop,
-                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def z_nc                : SDNode<"SystemZISD::NC", SDT_ZMemMemLength,
                                   [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_nc_loop           : SDNode<"SystemZISD::NC_LOOP", SDT_ZMemMemLoop,
-                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def z_oc                : SDNode<"SystemZISD::OC", SDT_ZMemMemLength,
                                   [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_oc_loop           : SDNode<"SystemZISD::OC_LOOP", SDT_ZMemMemLoop,
-                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def z_xc                : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
                                   [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_xc_loop           : SDNode<"SystemZISD::XC_LOOP", SDT_ZMemMemLoop,
-                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def z_clc               : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC,
                                  [SDNPHasChain, SDNPMayLoad]>;
-def z_clc_loop          : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoopCC,
-                                 [SDNPHasChain, SDNPMayLoad]>;
 def z_strcmp            : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC,
                                  [SDNPHasChain, SDNPMayLoad]>;
 def z_stpcpy            : SDNode<"SystemZISD::STPCPY", SDT_ZString,

diff  --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index 4a9ea69d101c2..edf4a7ce20d4b 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -17,32 +17,34 @@ using namespace llvm;
 
 #define DEBUG_TYPE "systemz-selectiondag-info"
 
-// Decide whether it is best to use a loop or straight-line code for
-// a block operation of Size bytes with source address Src and destination
-// address Dest.  Sequence is the opcode to use for straight-line code
-// (such as MVC) and Loop is the opcode to use for loops (such as MVC_LOOP).
-// Return the chain for the completed operation.
-static SDValue emitMemMem(SelectionDAG &DAG, const SDLoc &DL, unsigned Sequence,
-                          unsigned Loop, SDValue Chain, SDValue Dst,
-                          SDValue Src, uint64_t Size) {
-  EVT PtrVT = Src.getValueType();
-  // The heuristic we use is to prefer loops for anything that would
-  // require 7 or more MVCs.  With these kinds of sizes there isn't
-  // much to choose between straight-line code and looping code,
-  // since the time will be dominated by the MVCs themselves.
-  // However, the loop has 4 or 5 instructions (depending on whether
-  // the base addresses can be proved equal), so there doesn't seem
-  // much point using a loop for 5 * 256 bytes or fewer.  Anything in
-  // the range (5 * 256, 6 * 256) will need another instruction after
-  // the loop, so it doesn't seem worth using a loop then either.
-  // The next value up, 6 * 256, can be implemented in the same
-  // number of straight-line MVCs as 6 * 256 - 1.
-  if (Size > 6 * 256)
-    return DAG.getNode(Loop, DL, MVT::Other, Chain, Dst, Src,
-                       DAG.getConstant(Size, DL, PtrVT),
-                       DAG.getConstant(Size / 256, DL, PtrVT));
-  return DAG.getNode(Sequence, DL, MVT::Other, Chain, Dst, Src,
-                     DAG.getConstant(Size, DL, PtrVT));
+// Emit a mem-mem operation after subtracting one from size, which will be
+// added back during pseudo expansion. As the Reg case emitted here may be
+// converted by DAGCombiner into having an Imm length, they are both emitted
+// the same way.
+static SDValue emitMemMemImm(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
+                             SDValue Chain, SDValue Dst, SDValue Src,
+                             uint64_t Size) {
+  return DAG.getNode(Op, DL, MVT::Other, Chain, Dst, Src,
+                     DAG.getConstant(Size - 1, DL, Src.getValueType()));
+}
+
+static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
+                             SDValue Chain, SDValue Dst, SDValue Src,
+                             SDValue Size) {
+  SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64,
+                                  DAG.getZExtOrTrunc(Size, DL, MVT::i64),
+                                  DAG.getConstant(-1, DL, MVT::i64));
+  return DAG.getNode(Op, DL, MVT::Other, Chain, Dst, Src, LenMinus1);
+}
+
+// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size).
+// One is subtracted from size also here, per above.
+static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+                       SDValue Src1, SDValue Src2, uint64_t Size) {
+  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+  EVT PtrVT = Src1.getValueType();
+  return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2,
+                     DAG.getConstant(Size - 1, DL, PtrVT));
 }
 
 SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy(
@@ -53,8 +55,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy(
     return SDValue();
 
   if (auto *CSize = dyn_cast<ConstantSDNode>(Size))
-    return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
-                      Chain, Dst, Src, CSize->getZExtValue());
+    return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, Dst, Src,
+                         CSize->getZExtValue());
   return SDValue();
 }
 
@@ -127,52 +129,23 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
 
     // Handle the special case of a memset of 0, which can use XC.
     if (CByte && CByte->getZExtValue() == 0)
-      return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP,
-                        Chain, Dst, Dst, Bytes);
+      return emitMemMemImm(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Bytes);
 
     // Copy the byte to the first location and then use MVC to copy
     // it to the rest.
     Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment);
     SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
                                    DAG.getConstant(1, DL, PtrVT));
-    return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
-                      Chain, DstPlus1, Dst, Bytes - 1);
+    return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, DstPlus1, Dst,
+                         Bytes - 1);
   }
 
   // Variable length
-  if (CByte && CByte->getZExtValue() == 0) {
+  if (CByte && CByte->getZExtValue() == 0)
     // Handle the special case of a variable length memset of 0 with XC.
-    SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64,
-                                    DAG.getZExtOrTrunc(Size, DL, MVT::i64),
-                                    DAG.getConstant(-1, DL, MVT::i64));
-    SDValue TripC = DAG.getNode(ISD::SRL, DL, MVT::i64, LenMinus1,
-                                DAG.getConstant(8, DL, MVT::i64));
-    return DAG.getNode(SystemZISD::XC_LOOP, DL, MVT::Other, Chain, Dst, Dst,
-                       LenMinus1, TripC);
-  }
-  return SDValue();
-}
+    return emitMemMemReg(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Size);
 
-// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size),
-// deciding whether to use a loop or straight-line code.
-static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
-                       SDValue Src1, SDValue Src2, uint64_t Size) {
-  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
-  EVT PtrVT = Src1.getValueType();
-  // A two-CLC sequence is a clear win over a loop, not least because it
-  // needs only one branch.  A three-CLC sequence needs the same number
-  // of branches as a loop (i.e. 2), but is shorter.  That brings us to
-  // lengths greater than 768 bytes.  It seems relatively likely that
-  // a 
diff erence will be found within the first 768 bytes, so we just
-  // optimize for the smallest number of branch instructions, in order
-  // to avoid polluting the prediction buffer too much.  A loop only ever
-  // needs 2 branches, whereas a straight-line sequence would need 3 or more.
-  if (Size > 3 * 256)
-    return DAG.getNode(SystemZISD::CLC_LOOP, DL, VTs, Chain, Src1, Src2,
-                       DAG.getConstant(Size, DL, PtrVT),
-                       DAG.getConstant(Size / 256, DL, PtrVT));
-  return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2,
-                     DAG.getConstant(Size, DL, PtrVT));
+  return SDValue();
 }
 
 // Convert the current CC value into an integer that is 0 if CC == 0,

diff  --git a/llvm/test/CodeGen/SystemZ/memset-05.ll b/llvm/test/CodeGen/SystemZ/memset-05.ll
index 5754660705631..6d3aa315f8bd3 100644
--- a/llvm/test/CodeGen/SystemZ/memset-05.ll
+++ b/llvm/test/CodeGen/SystemZ/memset-05.ll
@@ -48,37 +48,37 @@ define void @fun2(i8* %Addr, i32 %Len) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    llgfr %r1, %r3
 ; CHECK-NEXT:    aghi %r1, -1
-; CHECK-NEXT:    srlg %r0, %r1, 8
 ; CHECK-NEXT:    cgije %r1, -1, .LBB2_5
 ; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    srlg %r0, %r1, 8
 ; CHECK-NEXT:    lgr %r3, %r2
 ; CHECK-NEXT:    cgije %r0, 0, .LBB2_4
 ; CHECK-NEXT:  # %bb.2:
 ; CHECK-NEXT:    lgr %r3, %r2
-; CHECK-NEXT:    lgr %r4, %r0
 ; CHECK-NEXT:  .LBB2_3: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    xc 0(256,%r3), 0(%r3)
 ; CHECK-NEXT:    la %r3, 256(%r3)
-; CHECK-NEXT:    brctg %r4, .LBB2_3
+; CHECK-NEXT:    brctg %r0, .LBB2_3
 ; CHECK-NEXT:  .LBB2_4:
 ; CHECK-NEXT:    exrl %r1, .Ltmp1
 ; CHECK-NEXT:  .LBB2_5:
 ; CHECK-NEXT:    cgije %r1, -1, .LBB2_10
 ; CHECK-NEXT:  # %bb.6:
+; CHECK-NEXT:    srlg %r0, %r1, 8
 ; CHECK-NEXT:    lgr %r3, %r2
 ; CHECK-NEXT:    cgije %r0, 0, .LBB2_9
 ; CHECK-NEXT:  # %bb.7:
 ; CHECK-NEXT:    lgr %r3, %r2
-; CHECK-NEXT:    lgr %r4, %r0
 ; CHECK-NEXT:  .LBB2_8: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    xc 0(256,%r3), 0(%r3)
 ; CHECK-NEXT:    la %r3, 256(%r3)
-; CHECK-NEXT:    brctg %r4, .LBB2_8
+; CHECK-NEXT:    brctg %r0, .LBB2_8
 ; CHECK-NEXT:  .LBB2_9:
 ; CHECK-NEXT:    exrl %r1, .Ltmp1
 ; CHECK-NEXT:  .LBB2_10:
 ; CHECK-NEXT:    cgibe %r1, -1, 0(%r14)
 ; CHECK-NEXT:  .LBB2_11:
+; CHECK-NEXT:    srlg %r0, %r1, 8
 ; CHECK-NEXT:    cgije %r0, 0, .LBB2_13
 ; CHECK-NEXT:  .LBB2_12: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    xc 0(256,%r2), 0(%r2)
@@ -114,6 +114,135 @@ define void @fun3(i64 %Len) {
   ret void
 }
 
+; Test that a memset with a length argument that DAGCombiner will convert
+; into a constant get the correct number of bytes set.
+ at Data = external hidden constant [1024 x i8], align 2
+define void @fun4() {
+; CHECK-LABEL: fun4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, Data
+; CHECK-NEXT:    xc 35(256,%r1), 35(%r1)
+; CHECK-NEXT:    xc 291(256,%r1), 291(%r1)
+; CHECK-NEXT:    xc 547(256,%r1), 547(%r1)
+; CHECK-NEXT:    xc 803(221,%r1), 803(%r1)
+; CHECK-NEXT:    mvghi 0(%r1), 989
+; CHECK-NEXT:    br %r14
+  call void @llvm.memset.p0i8.i64(
+     i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35),
+     i8 0,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 0) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 0, i64 35) to i64), i64 1)),
+     i1 false)
+  %i11 = getelementptr i8, i8* null,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 0) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 0, i64 35) to i64), i64 1))
+  store i8* %i11, i8** undef, align 8
+  ret void
+}
+
+; The same, with a resulting constant length of 0.
+define void @fun5() {
+; CHECK-LABEL: fun5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mvghi 0(%r1), 0
+; CHECK-NEXT:    br %r14
+  call void @llvm.memset.p0i8.i64(
+     i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35),
+     i8 0,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)),
+     i1 false)
+  %i11 = getelementptr i8, i8* null,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1))
+  store i8* %i11, i8** undef, align 8
+  ret void
+}
+
+; The same, with a resulting constant length of 1.
+define void @fun6() {
+; CHECK-LABEL: fun6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, Data
+; CHECK-NEXT:    xc 35(1,%r1), 35(%r1)
+; CHECK-NEXT:    mvghi 0(%r1), 1
+; CHECK-NEXT:    br %r14
+  call void @llvm.memset.p0i8.i64(
+     i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35),
+     i8 0,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 36) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)),
+     i1 false)
+  %i11 = getelementptr i8, i8* null,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 36) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1))
+  store i8* %i11, i8** undef, align 8
+  ret void
+}
+
+; The same, with a resulting constant length of 256.
+define void @fun7() {
+; CHECK-LABEL: fun7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, Data
+; CHECK-NEXT:    xc 35(256,%r1), 35(%r1)
+; CHECK-NEXT:    mvghi 0(%r1), 256
+; CHECK-NEXT:    br %r14
+  call void @llvm.memset.p0i8.i64(
+     i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35),
+     i8 0,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 291) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)),
+     i1 false)
+  %i11 = getelementptr i8, i8* null,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 291) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1))
+  store i8* %i11, i8** undef, align 8
+  ret void
+}
+
+; The same, with a resulting constant length of 257.
+define void @fun8() {
+; CHECK-LABEL: fun8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, Data
+; CHECK-NEXT:    xc 35(256,%r1), 35(%r1)
+; CHECK-NEXT:    xc 291(1,%r1), 291(%r1)
+; CHECK-NEXT:    mvghi 0(%r1), 257
+; CHECK-NEXT:    br %r14
+  call void @llvm.memset.p0i8.i64(
+     i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35),
+     i8 0,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 292) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)),
+     i1 false)
+  %i11 = getelementptr i8, i8* null,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 292) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1))
+  store i8* %i11, i8** undef, align 8
+  ret void
+}
+
 ; CHECK:       .Ltmp2:
 ; CHECK-NEXT: 	 xc 0(1,%r1), 0(%r1)
 ; CHECK-NEXT:  .Ltmp0:

diff  --git a/llvm/test/CodeGen/SystemZ/mverify-optypes.mir b/llvm/test/CodeGen/SystemZ/mverify-optypes.mir
index cc0be44233488..b9d53f44e9eb1 100644
--- a/llvm/test/CodeGen/SystemZ/mverify-optypes.mir
+++ b/llvm/test/CodeGen/SystemZ/mverify-optypes.mir
@@ -31,8 +31,8 @@ body:             |
     %4:addr64bit = LARL @gsrc
     %4:addr64bit = LARL $r2l
 
-    MVCLoop %4, 0, %3, 0, 1680, %0, implicit-def $cc
-    MVCLoop %4, 0, %3, 0, %1, %0, implicit-def $cc
+    MVCImm %4, 0, %3, 0, 1680, implicit-def $cc
+    MVCImm %4, 0, %3, 0, %1, implicit-def $cc
 
     BCR 0, 0, $r2d, implicit $cc
     BCR 0, $r2d, $r2d, implicit $cc
@@ -63,7 +63,7 @@ body:             |
 # CHECK: - operand 1:   $r2l
 
 # CHECK: *** Bad machine code: Expected a non-register operand. ***
-# CHECK: - instruction: MVCLoop %4:addr64bit, 0, %3:addr64bit, 0, %1:addr64bit, %0:gr64bit, implicit-def $cc
+# CHECK: - instruction: MVCImm %4:addr64bit, 0, %3:addr64bit, 0, %1:addr64bit, implicit-def $cc
 # CHECK: - operand 4:   %1:addr64bit
 
 # CHECK: *** Bad machine code: Expected a non-register operand. ***