[llvm] [SystemZ] Don't lower float/double ATOMIC_[LOAD|STORE] to [LOAD|STORE] (PR #75879)
Jonas Paulsson via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 9 03:03:27 PST 2024
https://github.com/JonPsson1 updated https://github.com/llvm/llvm-project/pull/75879
From ddba819ca007082be59a9ffc1a96e98039c5b6fd Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Mon, 18 Dec 2023 18:49:11 -0600
Subject: [PATCH 1/3] [SystemZ] Don't lower ATOMIC_[LOAD|STORE] to [LOAD|STORE]
(Use PatFrags for loads.) Try to convert to LOAD in select() instead. Was
e20dad7
---
llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 11 +
.../SelectionDAG/LegalizeIntegerTypes.cpp | 13 +
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 10 +
.../SelectionDAG/SelectionDAGDumper.cpp | 12 +
.../Target/SystemZ/SystemZISelDAGToDAG.cpp | 66 ++
.../Target/SystemZ/SystemZISelLowering.cpp | 140 ++--
llvm/lib/Target/SystemZ/SystemZISelLowering.h | 8 +-
llvm/lib/Target/SystemZ/SystemZInstrFP.td | 8 +-
llvm/lib/Target/SystemZ/SystemZOperators.td | 4 +
llvm/test/CodeGen/SystemZ/atomic-load-06.ll | 4 +-
llvm/test/CodeGen/SystemZ/atomic-memofolds.ll | 723 ++++++++++++++++++
llvm/test/CodeGen/SystemZ/atomic-store-06.ll | 5 +-
12 files changed, 945 insertions(+), 59 deletions(-)
create mode 100644 llvm/test/CodeGen/SystemZ/atomic-memofolds.ll
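For reference, the kind of IR this series is about (taken verbatim from the updated atomic-load-06.ll test below): with the patch, the atomic float load is expected to select a direct FP load (le) instead of going through a GPR via lgf/sllg/ldgr.

define float @f1(ptr %src) {
  %val = load atomic float, ptr %src seq_cst, align 4
  ret float %val
}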
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 3130f6c4dce598..d7468f936d2270 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -558,6 +558,7 @@ BEGIN_TWO_BYTE_PACK()
class LoadSDNodeBitfields {
friend class LoadSDNode;
+ friend class AtomicSDNode;
friend class VPLoadSDNode;
friend class VPStridedLoadSDNode;
friend class MaskedLoadSDNode;
@@ -1473,6 +1474,16 @@ class AtomicSDNode : public MemSDNode {
MMO->isAtomic()) && "then why are we using an AtomicSDNode?");
}
+ void setExtensionType(ISD::LoadExtType ETy) {
+ assert(getOpcode() == ISD::ATOMIC_LOAD && "Only used for atomic loads.");
+ LoadSDNodeBits.ExtTy = ETy;
+ }
+
+ ISD::LoadExtType getExtensionType() const {
+ assert(getOpcode() == ISD::ATOMIC_LOAD && "Only used for atomic loads.");
+ return static_cast<ISD::LoadExtType>(LoadSDNodeBits.ExtTy);
+ }
+
const SDValue &getBasePtr() const {
return getOpcode() == ISD::ATOMIC_STORE ? getOperand(2) : getOperand(1);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 39b7e061554141..7642b6f4160a7b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -340,6 +340,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) {
N->getMemoryVT(), ResVT,
N->getChain(), N->getBasePtr(),
N->getMemOperand());
+ if (N->getOpcode() == ISD::ATOMIC_LOAD) {
+ ISD::LoadExtType ETy = cast<AtomicSDNode>(N)->getExtensionType();
+ if (ETy == ISD::NON_EXTLOAD) {
+ if (TLI.getExtendForAtomicOps() == ISD::SIGN_EXTEND)
+ ETy = ISD::SEXTLOAD;
+ else if (TLI.getExtendForAtomicOps() == ISD::ZERO_EXTEND)
+ ETy = ISD::ZEXTLOAD;
+ else
+ ETy = ISD::EXTLOAD;
+ }
+ cast<AtomicSDNode>(Res)->setExtensionType(ETy);
+ }
+
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 55eee780d512c8..c38789d26e8e1f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4037,6 +4037,9 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
if (Op.getResNo() == 0) {
if (TLI->getExtendForAtomicOps() == ISD::ZERO_EXTEND)
Known.Zero.setBitsFrom(MemBits);
+ else if (Op->getOpcode() == ISD::ATOMIC_LOAD &&
+ cast<AtomicSDNode>(Op)->getExtensionType() == ISD::ZEXTLOAD)
+ Known.Zero.setBitsFrom(MemBits);
}
break;
}
@@ -4848,6 +4851,13 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
return VTBits - Tmp + 1;
if (TLI->getExtendForAtomicOps() == ISD::ZERO_EXTEND)
return VTBits - Tmp;
+ if (Op->getOpcode() == ISD::ATOMIC_LOAD) {
+ ISD::LoadExtType ETy = cast<AtomicSDNode>(Op)->getExtensionType();
+ if (ETy == ISD::SEXTLOAD)
+ return VTBits - Tmp + 1;
+ if (ETy == ISD::ZEXTLOAD)
+ return VTBits - Tmp;
+ }
}
break;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index a28d834f0522f2..20d627b679b571 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -828,6 +828,18 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
} else if (const MemSDNode *M = dyn_cast<MemSDNode>(this)) {
OS << "<";
printMemOperand(OS, *M->getMemOperand(), G);
+ if (auto *A = dyn_cast<AtomicSDNode>(M))
+ if (A->getOpcode() == ISD::ATOMIC_LOAD) {
+ bool doExt = true;
+ switch (A->getExtensionType()) {
+ default: doExt = false; break;
+ case ISD::EXTLOAD: OS << ", anyext"; break;
+ case ISD::SEXTLOAD: OS << ", sext"; break;
+ case ISD::ZEXTLOAD: OS << ", zext"; break;
+ }
+ if (doExt)
+ OS << " from " << A->getMemoryVT();
+ }
OS << ">";
} else if (const BlockAddressSDNode *BA =
dyn_cast<BlockAddressSDNode>(this)) {
diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 815eca1240d827..f4e6081ae82104 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -347,6 +347,9 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
// Try to expand a boolean SELECT_CCMASK using an IPM sequence.
SDValue expandSelectBoolean(SDNode *Node);
+ // Convert ATOMIC_LOADs to LOADs to facilitate instruction selection.
+ void convertATOMIC_LOADs(SDNode *Node, unsigned Depth = 0);
+
public:
static char ID;
@@ -1513,6 +1516,10 @@ bool SystemZDAGToDAGISel::storeLoadIsAligned(SDNode *N) const {
MachineMemOperand *MMO = MemAccess->getMemOperand();
assert(MMO && "Expected a memory operand.");
+ // These instructions are not atomic.
+ if (MMO->isAtomic())
+ return false;
+
// The memory access must have a proper alignment and no index register.
if (MemAccess->getAlign().value() < StoreSize ||
!MemAccess->getOffset().isUndef())
@@ -1545,6 +1552,37 @@ bool SystemZDAGToDAGISel::storeLoadIsAligned(SDNode *N) const {
return true;
}
+// This is a hack to convert ATOMIC_LOADs to LOADs at the last minute, just
+// before instruction selection begins. It would have been easier if
+// ATOMIC_LOAD nodes were instead always built by SelectionDAGBuilder as
+// LOADs with an atomic MMO and handled properly as such in DAGCombiner, but
+// until that changes they need to remain ATOMIC_LOADs until all
+// DAG combining is done. Convert Node or any of its operands from
+// ATOMIC_LOAD to LOAD.
+void SystemZDAGToDAGISel::convertATOMIC_LOADs(SDNode *Node, unsigned Depth) {
+ if (Depth > 1) // Chain operands are also followed so this seems enough.
+ return;
+ if (Node->getOpcode() == ISD::ATOMIC_LOAD) {
+ auto *ALoad = cast<AtomicSDNode>(Node);
+ // It seems necessary to morph the node as it is not yet being selected.
+ LoadSDNode *Ld = cast<LoadSDNode>(CurDAG->MorphNodeTo(
+ ALoad, ISD::LOAD, CurDAG->getVTList(ALoad->getValueType(0), MVT::Other),
+ {ALoad->getChain(), ALoad->getBasePtr()}));
+ // Sanity check the morph. The extension type for an extending load
+ // should have been set prior to instruction selection and remain in the
+ // morphed node.
+ assert(((SDNode *)Ld) == ((SDNode *)ALoad) && "Bad CSE on atomic load.");
+ assert(Ld->getMemOperand()->isAtomic() && "Broken MMO.");
+ ISD::LoadExtType ETy = Ld->getExtensionType();
+ bool IsNonExt = Ld->getMemoryVT().getSizeInBits() ==
+ Ld->getValueType(0).getSizeInBits();
+ assert(IsNonExt == (ETy == ISD::NON_EXTLOAD) && "Bad extension type.");
+ return;
+ }
+ for (SDValue Op : Node->ops())
+ convertATOMIC_LOADs(Op.getNode(), ++Depth);
+}
+
void SystemZDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
@@ -1553,6 +1591,9 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
return;
}
+ // Prepare any ATOMIC_LOAD to be selected as a LOAD with an atomic MMO.
+ convertATOMIC_LOADs(Node);
+
unsigned Opcode = Node->getOpcode();
switch (Opcode) {
case ISD::OR:
@@ -1742,6 +1783,31 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+
+ case ISD::ATOMIC_STORE: {
+ auto *AtomOp = cast<AtomicSDNode>(Node);
+ // Store FP values directly without first moving to a GPR.
+ EVT SVT = AtomOp->getMemoryVT();
+ SDValue StoredVal = AtomOp->getVal();
+ if (SVT.isInteger() && StoredVal->getOpcode() == ISD::BITCAST &&
+ StoredVal->getOperand(0).getValueType().isFloatingPoint()) {
+ StoredVal = StoredVal->getOperand(0);
+ SVT = StoredVal.getValueType();
+ }
+ StoreSDNode *St = cast<StoreSDNode>(CurDAG->getTruncStore(
+ AtomOp->getChain(), SDLoc(AtomOp), StoredVal, AtomOp->getBasePtr(), SVT,
+ AtomOp->getMemOperand()));
+ assert(St->getMemOperand()->isAtomic() && "Broken MMO.");
+ SDNode *Chain = St;
+ // We have to enforce sequential consistency by performing a
+ // serialization operation after the store.
+ if (AtomOp->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent)
+ Chain = CurDAG->getMachineNode(SystemZ::Serialize, SDLoc(AtomOp),
+ MVT::Other, SDValue(Chain, 0));
+ ReplaceNode(Node, Chain);
+ SelectCode(St);
+ return;
+ }
}
SelectCode(Node);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index d92586f7d05d0d..39bd1aea6979c4 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -194,11 +194,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UADDO_CARRY, VT, Custom);
setOperationAction(ISD::USUBO_CARRY, VT, Custom);
- // Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
- // stores, putting a serialization instruction after the stores.
- setOperationAction(ISD::ATOMIC_LOAD, VT, Custom);
- setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
-
// Lower ATOMIC_LOAD_SUB into ATOMIC_LOAD_ADD if LAA and LAAG are
// available, or if the operand is constant.
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
@@ -700,7 +695,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
// Codes for which we want to perform some z-specific combinations.
- setTargetDAGCombine({ISD::ZERO_EXTEND,
+ setTargetDAGCombine({ISD::BITCAST,
+ ISD::ZERO_EXTEND,
ISD::SIGN_EXTEND,
ISD::SIGN_EXTEND_INREG,
ISD::LOAD,
@@ -920,6 +916,22 @@ bool SystemZTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const
return false;
}
+TargetLowering::AtomicExpansionKind
+SystemZTargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
+ // Lower fp128 the same way as i128.
+ if (LI->getType()->isFP128Ty())
+ return AtomicExpansionKind::CastToInteger;
+ return AtomicExpansionKind::None;
+}
+
+TargetLowering::AtomicExpansionKind
+SystemZTargetLowering::shouldCastAtomicStoreInIR(StoreInst *SI) const {
+ // Lower fp128 the same way as i128.
+ if (SI->getValueOperand()->getType()->isFP128Ty())
+ return AtomicExpansionKind::CastToInteger;
+ return AtomicExpansionKind::None;
+}
+
TargetLowering::AtomicExpansionKind
SystemZTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// Don't expand subword operations as they require special treatment.
@@ -4503,40 +4515,14 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
return DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
}
-// Op is an atomic load. Lower it into a normal volatile load.
-SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue SystemZTargetLowering::lowerATOMIC_I128_LDST(SDValue Op,
+ SelectionDAG &DAG) const {
auto *Node = cast<AtomicSDNode>(Op.getNode());
- if (Node->getMemoryVT() == MVT::i128) {
- // Use same code to handle both legal and non-legal i128 types.
- SmallVector<SDValue, 2> Results;
- LowerOperationWrapper(Node, Results, DAG);
- return DAG.getMergeValues(Results, SDLoc(Op));
- }
- return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(),
- Node->getChain(), Node->getBasePtr(),
- Node->getMemoryVT(), Node->getMemOperand());
-}
-
-// Op is an atomic store. Lower it into a normal volatile store.
-SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
- SelectionDAG &DAG) const {
- auto *Node = cast<AtomicSDNode>(Op.getNode());
- if (Node->getMemoryVT() == MVT::i128) {
- // Use same code to handle both legal and non-legal i128 types.
- SmallVector<SDValue, 1> Results;
- LowerOperationWrapper(Node, Results, DAG);
- return DAG.getMergeValues(Results, SDLoc(Op));
- }
- SDValue Chain = DAG.getTruncStore(Node->getChain(), SDLoc(Op), Node->getVal(),
- Node->getBasePtr(), Node->getMemoryVT(),
- Node->getMemOperand());
- // We have to enforce sequential consistency by performing a
- // serialization operation after the store.
- if (Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent)
- Chain = SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op),
- MVT::Other, Chain), 0);
- return Chain;
+ assert(Node->getMemoryVT() == MVT::i128 && "Only custom lowering i128.");
+ // Use same code to handle both legal and non-legal i128 types.
+ SmallVector<SDValue, 2> Results;
+ LowerOperationWrapper(Node, Results, DAG);
+ return DAG.getMergeValues(Results, SDLoc(Op));
}
// Prepare for a Compare And Swap for a subword operation. This needs to be
@@ -5659,9 +5645,13 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
return GS.getNode(DAG, SDLoc(BVN));
}
-bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const {
+bool SystemZTargetLowering::isVectorElementLoad(SDValue Op, EVT VecVT) const {
if (Op.getOpcode() == ISD::LOAD && cast<LoadSDNode>(Op)->isUnindexed())
return true;
+ if (auto *AL = dyn_cast<AtomicSDNode>(Op))
+ if (AL->getOpcode() == ISD::ATOMIC_LOAD && SDValue(AL, 0).hasOneUse() &&
+ AL->getMemoryVT() == VecVT.getScalarType())
+ return true;
if (Subtarget.hasVectorEnhancements2() && Op.getOpcode() == SystemZISD::LRV)
return true;
return false;
@@ -5699,13 +5689,13 @@ SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
// we would need 2 instructions to replicate it: VLVGP followed by VREPx.
// This is only a win if the single defined element is used more than once.
// In other cases we're better off using a single VLVGx.
- if (Single.getNode() && (Count > 1 || isVectorElementLoad(Single)))
+ if (Single.getNode() && (Count > 1 || isVectorElementLoad(Single, VT)))
return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single);
// If all elements are loads, use VLREP/VLEs (below).
bool AllLoads = true;
for (auto Elem : Elems)
- if (!isVectorElementLoad(Elem)) {
+ if (!isVectorElementLoad(Elem, VT)) {
AllLoads = false;
break;
}
@@ -5777,7 +5767,7 @@ SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
std::map<const SDNode*, unsigned> UseCounts;
SDNode *LoadMaxUses = nullptr;
for (unsigned I = 0; I < NumElements; ++I)
- if (isVectorElementLoad(Elems[I])) {
+ if (isVectorElementLoad(Elems[I], VT)) {
SDNode *Ld = Elems[I].getNode();
UseCounts[Ld]++;
if (LoadMaxUses == nullptr || UseCounts[LoadMaxUses] < UseCounts[Ld])
@@ -6138,9 +6128,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
case ISD::ATOMIC_SWAP:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
case ISD::ATOMIC_STORE:
- return lowerATOMIC_STORE(Op, DAG);
case ISD::ATOMIC_LOAD:
- return lowerATOMIC_LOAD(Op, DAG);
+ return lowerATOMIC_I128_LDST(Op, DAG);
case ISD::ATOMIC_LOAD_ADD:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
case ISD::ATOMIC_LOAD_SUB:
@@ -6587,6 +6576,52 @@ SDValue SystemZTargetLowering::combineTruncateExtract(
return SDValue();
}
+// Replace ALoad with a new ATOMIC_LOAD with a result that is extended to VT
+// per ETy.
+static SDValue extendAtomicLoad(AtomicSDNode *ALoad, EVT VT, SelectionDAG &DAG,
+ ISD::LoadExtType ETy) {
+ if (VT.getSizeInBits() > 64)
+ return SDValue();
+ EVT OrigVT = ALoad->getValueType(0);
+ assert(OrigVT.getSizeInBits() < VT.getSizeInBits() && "VT should be wider.");
+ EVT MemoryVT = ALoad->getMemoryVT();
+ auto *NewALoad = dyn_cast<AtomicSDNode>(DAG.getAtomic(
+ ISD::ATOMIC_LOAD, SDLoc(ALoad), MemoryVT, VT, ALoad->getChain(),
+ ALoad->getBasePtr(), ALoad->getMemOperand()));
+ NewALoad->setExtensionType(ETy);
+ DAG.ReplaceAllUsesOfValueWith(
+ SDValue(ALoad, 0),
+ DAG.getNode(ISD::TRUNCATE, SDLoc(ALoad), OrigVT, SDValue(NewALoad, 0)));
+ // Update the chain uses.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(ALoad, 1), SDValue(NewALoad, 1));
+ return SDValue(NewALoad, 0);
+}
+
+SDValue SystemZTargetLowering::combineBITCAST(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+ EVT InVT = N0.getValueType();
+ EVT ResVT = N->getValueType(0);
+  // Handle atomic loads so that float/double values are loaded directly and
+  // not via a GPR. Do this before legalization, which helps to treat the
+  // ATOMIC_LOAD the same way as a LOAD and e.g. emit a REPLICATE.
+ if (auto *ALoad = dyn_cast<AtomicSDNode>(N0))
+ if (ALoad->getOpcode() == ISD::ATOMIC_LOAD && InVT.getSizeInBits() <= 64 &&
+ ALoad->getExtensionType() == ISD::NON_EXTLOAD &&
+ SDValue(ALoad, 0).hasOneUse() && InVT.isInteger() &&
+ ResVT.isFloatingPoint()) {
+ SDValue Res = DAG.getAtomic(ISD::ATOMIC_LOAD, SDLoc(N), ResVT, ResVT,
+ ALoad->getChain(), ALoad->getBasePtr(),
+ ALoad->getMemOperand());
+ // Update the chain uses.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(ALoad, 1), Res.getValue(1));
+ return Res;
+ }
+
+ return SDValue();
+}
+
SDValue SystemZTargetLowering::combineZERO_EXTEND(
SDNode *N, DAGCombinerInfo &DCI) const {
// Convert (zext (select_ccmask C1, C2)) into (select_ccmask C1', C2')
@@ -6611,6 +6646,13 @@ SDValue SystemZTargetLowering::combineZERO_EXTEND(
return NewSelect;
}
}
+
+ // Fold into ATOMIC_LOAD unless it is already sign extending.
+ if (auto *ALoad = dyn_cast<AtomicSDNode>(N0))
+ if (ALoad->getOpcode() == ISD::ATOMIC_LOAD &&
+ ALoad->getExtensionType() != ISD::SEXTLOAD)
+ return extendAtomicLoad(ALoad, VT, DAG, ISD::ZEXTLOAD);
+
return SDValue();
}
@@ -6662,6 +6704,13 @@ SDValue SystemZTargetLowering::combineSIGN_EXTEND(
}
}
}
+
+ // Fold into ATOMIC_LOAD unless it is already zero extending.
+ if (auto *ALoad = dyn_cast<AtomicSDNode>(N0))
+ if (ALoad->getOpcode() == ISD::ATOMIC_LOAD &&
+ ALoad->getExtensionType() != ISD::ZEXTLOAD)
+ return extendAtomicLoad(ALoad, VT, DAG, ISD::SEXTLOAD);
+
return SDValue();
}
@@ -7633,6 +7682,7 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch(N->getOpcode()) {
default: break;
+ case ISD::BITCAST: return combineBITCAST(N, DCI);
case ISD::ZERO_EXTEND: return combineZERO_EXTEND(N, DCI);
case ISD::SIGN_EXTEND: return combineSIGN_EXTEND(N, DCI);
case ISD::SIGN_EXTEND_INREG: return combineSIGN_EXTEND_INREG(N, DCI);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index baf4ba41654879..9c442268dbb111 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -474,6 +474,8 @@ class SystemZTargetLowering : public TargetLowering {
return VT != MVT::f64;
}
bool hasInlineStackProbe(const MachineFunction &MF) const override;
+ AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const override;
+ AtomicExpansionKind shouldCastAtomicStoreInIR(StoreInst *SI) const override;
AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const override;
bool isLegalICmpImmediate(int64_t Imm) const override;
@@ -692,8 +694,7 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_I128_LDST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
unsigned Opcode) const;
SDValue lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
@@ -703,7 +704,7 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
- bool isVectorElementLoad(SDValue Op) const;
+ bool isVectorElementLoad(SDValue Op, EVT VecVT) const;
SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
SmallVectorImpl<SDValue> &Elems) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -723,6 +724,7 @@ class SystemZTargetLowering : public TargetLowering {
bool Force) const;
SDValue combineTruncateExtract(const SDLoc &DL, EVT TruncVT, SDValue Op,
DAGCombinerInfo &DCI) const;
+ SDValue combineBITCAST(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineZERO_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSIGN_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSIGN_EXTEND_INREG(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 6e67425c1e788b..8cafa0a936a404 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -495,8 +495,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MAEBR : TernaryRRD<"maebr", 0xB30E, z_any_fma, FP32, FP32>;
def MADBR : TernaryRRD<"madbr", 0xB31E, z_any_fma, FP64, FP64>;
- defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, load, 4>;
- defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, load, 8>;
+ defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, nonatomic_ld, 4>;
+ defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, nonatomic_ld, 8>;
}
// Fused multiply-subtract.
@@ -504,8 +504,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MSEBR : TernaryRRD<"msebr", 0xB30F, z_any_fms, FP32, FP32>;
def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_any_fms, FP64, FP64>;
- defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, load, 4>;
- defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, load, 8>;
+ defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, nonatomic_ld, 4>;
+ defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, nonatomic_ld, 8>;
}
// Division.
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index d98bb886c18506..28815083daab0c 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -607,6 +607,10 @@ def nonvolatile_anyextloadi8 : NonvolatileLoad<anyextloadi8>;
def nonvolatile_anyextloadi16 : NonvolatileLoad<anyextloadi16>;
def nonvolatile_anyextloadi32 : NonvolatileLoad<anyextloadi32>;
+def nonatomic_ld : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return !cast<LoadSDNode>(N)->isAtomic();
+}]>;
+
// Non-volatile stores.
class NonvolatileStore<SDPatternOperator store>
: PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-06.ll b/llvm/test/CodeGen/SystemZ/atomic-load-06.ll
index c9c5504520345c..d75f15a574f7ef 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-06.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-06.ll
@@ -4,9 +4,7 @@
define float @f1(ptr %src) {
; CHECK-LABEL: f1:
-; CHECK: lgf [[R:%r[0-9]+]], 0(%r2)
-; CHECK: sllg [[R]], [[R]], 32
-; CHECK: ldgr %f0, [[R]]
+; CHECK: le %f0
; CHECK: br %r14
%val = load atomic float, ptr %src seq_cst, align 4
ret float %val
diff --git a/llvm/test/CodeGen/SystemZ/atomic-memofolds.ll b/llvm/test/CodeGen/SystemZ/atomic-memofolds.ll
new file mode 100644
index 00000000000000..56c1eb2b85a8d5
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/atomic-memofolds.ll
@@ -0,0 +1,723 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s
+
+; Sign-extending atomic loads.
+define void @f1(ptr %src, ptr %dst) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lb %r0, 0(%r2)
+; CHECK-NEXT: sth %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i8, ptr %src seq_cst, align 1
+ %s = sext i8 %b to i16
+ store volatile i16 %s, ptr %dst
+ ret void
+}
+
+define void @f2(ptr %src, ptr %dst) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lb %r0, 0(%r2)
+; CHECK-NEXT: st %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i8, ptr %src seq_cst, align 1
+ %s = sext i8 %b to i32
+ store volatile i32 %s, ptr %dst
+ ret void
+}
+
+define void @f3(ptr %src, ptr %dst) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lgb %r0, 0(%r2)
+; CHECK-NEXT: stg %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i8, ptr %src seq_cst, align 1
+ %s = sext i8 %b to i64
+ store volatile i64 %s, ptr %dst
+ ret void
+}
+
+define void @f4(ptr %src, ptr %dst) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lh %r0, 0(%r2)
+; CHECK-NEXT: st %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i16, ptr %src seq_cst, align 2
+ %s = sext i16 %b to i32
+ store volatile i32 %s, ptr %dst
+ ret void
+}
+
+define void @f5(ptr %src, ptr %dst) {
+; CHECK-LABEL: f5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lgh %r0, 0(%r2)
+; CHECK-NEXT: stg %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i16, ptr %src seq_cst, align 2
+ %s = sext i16 %b to i64
+ store volatile i64 %s, ptr %dst
+ ret void
+}
+
+define void @f6(ptr %src, ptr %dst) {
+; CHECK-LABEL: f6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lgf %r0, 0(%r2)
+; CHECK-NEXT: stg %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i32, ptr %src seq_cst, align 4
+ %s = sext i32 %b to i64
+ store volatile i64 %s, ptr %dst
+ ret void
+}
+
+; Zero-extending atomic loads.
+define void @f7(ptr %src, ptr %dst) {
+; CHECK-LABEL: f7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: llc %r0, 0(%r2)
+; CHECK-NEXT: sth %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i8, ptr %src seq_cst, align 1
+ %z = zext i8 %b to i16
+ store volatile i16 %z, ptr %dst
+ ret void
+}
+
+define void @f8(ptr %src, ptr %dst) {
+; CHECK-LABEL: f8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: llc %r0, 0(%r2)
+; CHECK-NEXT: st %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i8, ptr %src seq_cst, align 1
+ %z = zext i8 %b to i32
+ store volatile i32 %z, ptr %dst
+ ret void
+}
+
+define void @f9(ptr %src, ptr %dst) {
+; CHECK-LABEL: f9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: llgc %r0, 0(%r2)
+; CHECK-NEXT: stg %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i8, ptr %src seq_cst, align 1
+ %z = zext i8 %b to i64
+ store volatile i64 %z, ptr %dst
+ ret void
+}
+
+define void @f10(ptr %src, ptr %dst) {
+; CHECK-LABEL: f10:
+; CHECK: # %bb.0:
+; CHECK-NEXT: llh %r0, 0(%r2)
+; CHECK-NEXT: st %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i16, ptr %src seq_cst, align 2
+ %z = zext i16 %b to i32
+ store volatile i32 %z, ptr %dst
+ ret void
+}
+
+define void @f11(ptr %src, ptr %dst) {
+; CHECK-LABEL: f11:
+; CHECK: # %bb.0:
+; CHECK-NEXT: llgh %r0, 0(%r2)
+; CHECK-NEXT: stg %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i16, ptr %src seq_cst, align 2
+ %z = zext i16 %b to i64
+ store volatile i64 %z, ptr %dst
+ ret void
+}
+
+define void @f12(ptr %src, ptr %dst) {
+; CHECK-LABEL: f12:
+; CHECK: # %bb.0:
+; CHECK-NEXT: llgf %r0, 0(%r2)
+; CHECK-NEXT: stg %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i32, ptr %src seq_cst, align 4
+ %z = zext i32 %b to i64
+ store volatile i64 %z, ptr %dst
+ ret void
+}
+
+; reg/mem
+define i64 @f13(i64 %a, ptr %src) {
+; CHECK-LABEL: f13:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ag %r2, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i64, ptr %src seq_cst, align 8
+ %add = add i64 %a, %b
+ ret i64 %add
+}
+
+; reg/mem op with extension from memory.
+define i64 @f14(i64 %a, ptr %src) {
+; CHECK-LABEL: f14:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slgf %r2, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i32, ptr %src seq_cst, align 4
+ %bext = zext i32 %b to i64
+ %sub = sub i64 %a, %bext
+ ret i64 %sub
+}
+
+; Check that maeb (reg/mem) is *not* used for an atomic load.
+define float @f15(float %f1, ptr %ptr, float %acc) {
+; CHECK-LABEL: f15:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lde %f1, 0(%r2)
+; CHECK-NEXT: wfmasb %f0, %f0, %f1, %f2
+; CHECK-NEXT: br %r14
+ %f2 = load atomic float, ptr %ptr seq_cst, align 4
+ %res = call float @llvm.fma.f32 (float %f1, float %f2, float %acc)
+ ret float %res
+}
+declare float @llvm.fma.f32(float %f1, float %f2, float %f3)
+
+; Do it twice for good measure given the involved DAG combines.
+define void @f16(ptr %src, ptr %dst) {
+; CHECK-LABEL: f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: llgc %r0, 0(%r2)
+; CHECK-NEXT: lgbr %r1, %r0
+; CHECK-NEXT: stg %r1, 0(%r3)
+; CHECK-NEXT: stg %r0, 0(%r3)
+; CHECK-NEXT: llgc %r0, 0(%r2)
+; CHECK-NEXT: lgbr %r1, %r0
+; CHECK-NEXT: stg %r1, 0(%r3)
+; CHECK-NEXT: stg %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i8, ptr %src seq_cst, align 1
+ %s = sext i8 %b to i64
+ %z = zext i8 %b to i64
+ store volatile i64 %s, ptr %dst
+ store volatile i64 %z, ptr %dst
+
+ %b2 = load atomic i8, ptr %src seq_cst, align 1
+ %s2 = sext i8 %b2 to i64
+ %z2 = zext i8 %b2 to i64
+ store volatile i64 %s2, ptr %dst
+ store volatile i64 %z2, ptr %dst
+
+ ret void
+}
+
+define void @f16_b(ptr %src, ptr %dst) {
+; CHECK-LABEL: f16_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lgb %r0, 0(%r2)
+; CHECK-NEXT: sth %r0, 0(%r3)
+; CHECK-NEXT: stg %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i8, ptr %src seq_cst, align 1
+ %s = sext i8 %b to i16
+ store volatile i16 %s, ptr %dst
+
+ %s2 = sext i8 %b to i64
+ store volatile i64 %s2, ptr %dst
+
+ ret void
+}
+
+define void @f16_c(ptr %src, ptr %dst) {
+; CHECK-LABEL: f16_c:
+; CHECK: # %bb.0:
+; CHECK-NEXT: llgc %r0, 0(%r2)
+; CHECK-NEXT: sth %r0, 0(%r3)
+; CHECK-NEXT: stg %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %b = load atomic i8, ptr %src seq_cst, align 1
+ %z = zext i8 %b to i16
+ store volatile i16 %z, ptr %dst
+
+ %z2 = zext i8 %b to i64
+ store volatile i64 %z2, ptr %dst
+
+ ret void
+}
+
+; Check that two i8 loads use a reg/reg op.
+define i8 @f16_d(ptr %src, ptr %src2) {
+; CHECK-LABEL: f16_d:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lb %r2, 0(%r2)
+; CHECK-NEXT: lb %r0, 0(%r3)
+; CHECK-NEXT: ar %r2, %r0
+; CHECK-NEXT: br %r14
+ %b = load atomic i8, ptr %src seq_cst, align 1
+ %b2 = load atomic i8, ptr %src2 seq_cst, align 1
+ %add = add i8 %b, %b2
+ ret i8 %add
+}
+
+; Binary operations on a byte in memory, with an atomic load.
+define void @f17(ptr %ptr) {
+; CHECK-LABEL: f17:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ni 0(%r2), 1
+; CHECK-NEXT: br %r14
+ %val = load atomic i8, ptr %ptr seq_cst, align 1
+ %xor = and i8 %val, -255
+ store i8 %xor, ptr %ptr
+ ret void
+}
+
+define void @f18(ptr %src) {
+; CHECK-LABEL: f18:
+; CHECK: # %bb.0:
+; CHECK-NEXT: oiy 4096(%r2), 1
+; CHECK-NEXT: br %r14
+ %ptr = getelementptr i8, ptr %src, i64 4096
+ %val = load atomic i8, ptr %ptr seq_cst, align 1
+ %xor = or i8 %val, -255
+ store i8 %xor, ptr %ptr
+ ret void
+}
+
+define void @f19(ptr %src) {
+; CHECK-LABEL: f19:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xi 4095(%r2), 1
+; CHECK-NEXT: br %r14
+ %ptr = getelementptr i8, ptr %src, i64 4095
+ %val = load atomic i8, ptr %ptr seq_cst, align 1
+ %xor = xor i8 %val, -255
+ store i8 %xor, ptr %ptr
+ ret void
+}
+
+; TM
+define double @f20(ptr %src, double %a, double %b) {
+; CHECK-LABEL: f20:
+; CHECK: # %bb.0:
+; CHECK-NEXT: tm 0(%r2), 1
+; CHECK-NEXT: je .LBB22_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ldr %f2, %f0
+; CHECK-NEXT: .LBB22_2:
+; CHECK-NEXT: ldr %f0, %f2
+; CHECK-NEXT: br %r14
+ %byte = load atomic i8, ptr %src seq_cst, align 1
+ %and = and i8 %byte, 1
+ %cmp = icmp eq i8 %and, 0
+ %res = select i1 %cmp, double %b, double %a
+ ret double %res
+}
+
+; vector load and replicate
+define void @f21(ptr %src, ptr %dst) {
+; CHECK-LABEL: f21:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vlrepb %v0, 0(%r2)
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+ %b = load atomic i8, ptr %src seq_cst, align 1
+ %v = insertelement <16 x i8> undef, i8 %b, i32 1
+ store volatile <16 x i8> %v, ptr %dst
+ ret void
+}
+
+define void @f22(ptr %src, ptr %dst) {
+; CHECK-LABEL: f22:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vlreph %v0, 0(%r2)
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+ %b = load atomic i16, ptr %src seq_cst, align 2
+ %v = insertelement <8 x i16> undef, i16 %b, i32 1
+ store volatile <8 x i16> %v, ptr %dst
+ ret void
+}
+
+define void @f23(ptr %src, ptr %dst) {
+; CHECK-LABEL: f23:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vlrepf %v0, 0(%r2)
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+ %b = load atomic i32, ptr %src seq_cst, align 4
+ %v = insertelement <4 x i32> undef, i32 %b, i32 2
+ store volatile <4 x i32> %v, ptr %dst
+ ret void
+}
+
+define void @f24(ptr %src, ptr %dst) {
+; CHECK-LABEL: f24:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vlrepg %v0, 0(%r2)
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+ %b = load atomic i64, ptr %src seq_cst, align 8
+ %v = insertelement <2 x i64> undef, i64 %b, i32 0
+ store volatile <2 x i64> %v, ptr %dst
+ ret void
+}
+
+define void @f25(ptr %src, ptr %dst) {
+; CHECK-LABEL: f25:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vlrepf %v0, 0(%r2)
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+ %b = load atomic float, ptr %src seq_cst, align 4
+ %v = insertelement <4 x float> undef, float %b, i32 1
+ store volatile <4 x float> %v, ptr %dst
+ ret void
+}
+
+define void @f25_b(ptr %src, ptr %dst) {
+; CHECK-LABEL: f25_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vlrepf %v0, 0(%r2)
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+ %l = load atomic i32, ptr %src seq_cst, align 4
+ %b = bitcast i32 %l to float
+ %v = insertelement <4 x float> undef, float %b, i32 1
+ store volatile <4 x float> %v, ptr %dst
+ ret void
+}
+
+define void @f26(ptr %src, ptr %dst) {
+; CHECK-LABEL: f26:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vlrepg %v0, 0(%r2)
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+ %b = load atomic double, ptr %src seq_cst, align 8
+ %v = insertelement <2 x double> undef, double %b, i32 0
+ store volatile <2 x double> %v, ptr %dst
+ ret void
+}
+
+define void @f26_b(ptr %src, ptr %dst) {
+; CHECK-LABEL: f26_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vlrepg %v0, 0(%r2)
+; CHECK-NEXT: vst %v0, 0(%r3), 3
+; CHECK-NEXT: br %r14
+ %l = load atomic i64, ptr %src seq_cst, align 8
+ %b = bitcast i64 %l to double
+ %v = insertelement <2 x double> undef, double %b, i32 0
+ store volatile <2 x double> %v, ptr %dst
+ ret void
+}
+
+; Vector Load logical element and zero.
+define <16 x i8> @f27(ptr %ptr) {
+; CHECK-LABEL: f27:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vllezb %v24, 0(%r2)
+; CHECK-NEXT: br %r14
+ %val = load atomic i8, ptr %ptr seq_cst, align 1
+ %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7
+ ret <16 x i8> %ret
+}
+
+define <8 x i16> @f28(ptr %ptr) {
+; CHECK-LABEL: f28:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vllezh %v24, 0(%r2)
+; CHECK-NEXT: br %r14
+ %val = load atomic i16, ptr %ptr seq_cst, align 2
+ %ret = insertelement <8 x i16> zeroinitializer, i16 %val, i32 3
+ ret <8 x i16> %ret
+}
+
+define <4 x i32> @f29(ptr %ptr) {
+; CHECK-LABEL: f29:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vllezf %v24, 0(%r2)
+; CHECK-NEXT: br %r14
+ %val = load atomic i32, ptr %ptr seq_cst, align 4
+ %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 1
+ ret <4 x i32> %ret
+}
+
+define <2 x i64> @f30(ptr %ptr) {
+; CHECK-LABEL: f30:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vllezg %v24, 0(%r2)
+; CHECK-NEXT: br %r14
+ %val = load atomic i64, ptr %ptr seq_cst, align 8
+ %ret = insertelement <2 x i64> zeroinitializer, i64 %val, i32 0
+ ret <2 x i64> %ret
+}
+
+define <4 x i32> @f31(ptr %ptr) {
+; CHECK-LABEL: f31:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vllezlf %v24, 0(%r2)
+; CHECK-NEXT: br %r14
+ %val = load atomic i32, ptr %ptr seq_cst, align 4
+ %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 0
+ ret <4 x i32> %ret
+}
+
+define <4 x float> @f32(ptr %ptr) {
+; CHECK-LABEL: f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vllezlf %v24, 0(%r2)
+; CHECK-NEXT: br %r14
+ %val = load atomic float, ptr %ptr seq_cst, align 4
+ %ret = insertelement <4 x float> zeroinitializer, float %val, i32 0
+ ret <4 x float> %ret
+}
+
+; Vector Load element.
+define <16 x i8> @f33(<16 x i8> %val, ptr %ptr) {
+; CHECK-LABEL: f33:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vleb %v24, 0(%r2), 0
+; CHECK-NEXT: br %r14
+ %element = load atomic i8, ptr %ptr seq_cst, align 1
+ %ret = insertelement <16 x i8> %val, i8 %element, i32 0
+ ret <16 x i8> %ret
+}
+
+define <8 x i16> @f34(<8 x i16> %val, ptr %ptr) {
+; CHECK-LABEL: f34:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vleh %v24, 0(%r2), 0
+; CHECK-NEXT: br %r14
+ %element = load atomic i16, ptr %ptr seq_cst, align 2
+ %ret = insertelement <8 x i16> %val, i16 %element, i32 0
+ ret <8 x i16> %ret
+}
+
+define <4 x i32> @f35(<4 x i32> %val, ptr %ptr) {
+; CHECK-LABEL: f35:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vlef %v24, 0(%r2), 0
+; CHECK-NEXT: br %r14
+ %element = load atomic i32, ptr %ptr seq_cst, align 4
+ %ret = insertelement <4 x i32> %val, i32 %element, i32 0
+ ret <4 x i32> %ret
+}
+
+define <2 x i64> @f36(<2 x i64> %val, ptr %ptr) {
+; CHECK-LABEL: f36:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vleg %v24, 0(%r2), 0
+; CHECK-NEXT: br %r14
+ %element = load atomic i64, ptr %ptr seq_cst, align 8
+ %ret = insertelement <2 x i64> %val, i64 %element, i32 0
+ ret <2 x i64> %ret
+}
+
+; Test that FP values are loaded/stored directly. The Clang FE currently
+; always emits atomic loads/stores cast this way.
+define void @f37(ptr %src, ptr %dst) {
+; CHECK-LABEL: f37:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld %f0, 0(%r2)
+; CHECK-NEXT: adbr %f0, %f0
+; CHECK-NEXT: std %f0, 0(%r3)
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ %atomic-load = load atomic i64, ptr %src seq_cst, align 8
+ %bc0 = bitcast i64 %atomic-load to double
+ %fa = fadd double %bc0, %bc0
+ %bc1 = bitcast double %fa to i64
+ store atomic i64 %bc1, ptr %dst seq_cst, align 8
+ ret void
+}
+
+define void @f38(ptr %src, ptr %dst) {
+; CHECK-LABEL: f38:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lde %f0, 0(%r2)
+; CHECK-NEXT: aebr %f0, %f0
+; CHECK-NEXT: ste %f0, 0(%r3)
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ %atomic-load = load atomic i32, ptr %src seq_cst, align 8
+ %bc0 = bitcast i32 %atomic-load to float
+ %fa = fadd float %bc0, %bc0
+ %bc1 = bitcast float %fa to i32
+ store atomic i32 %bc1, ptr %dst seq_cst, align 8
+ ret void
+}
+
+; Test operation on memory involving atomic load and store.
+define void @f39(ptr %ptr) {
+; CHECK-LABEL: f39:
+; CHECK: # %bb.0:
+; CHECK-NEXT: oi 0(%r2), 1
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ %val = load atomic i8, ptr %ptr seq_cst, align 1
+ %or = or i8 %val, -255
+ store atomic i8 %or, ptr %ptr seq_cst, align 1
+ ret void
+}
+
+; Some atomic stores of immediates.
+define void @f40(ptr %ptr) {
+; CHECK-LABEL: f40:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mvi 0(%r2), 128
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ store atomic i8 128, ptr %ptr seq_cst, align 1
+ ret void
+}
+
+define void @f41(ptr %ptr) {
+; CHECK-LABEL: f41:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mvhi 0(%r2), -1
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ store atomic i32 4294967295, ptr %ptr seq_cst, align 4
+ ret void
+}
+
+define void @f42(ptr %ptr) {
+; CHECK-LABEL: f42:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mvhi 0(%r2), -1
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ store atomic i32 4294967295, ptr %ptr seq_cst, align 4
+ ret void
+}
+
+define void @f43(ptr %ptr) {
+; CHECK-LABEL: f43:
+; CHECK: # %bb.0:
+; CHECK-NEXT: llihl %r0, 255
+; CHECK-NEXT: oilf %r0, 4294967295
+; CHECK-NEXT: stg %r0, 0(%r2)
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ store atomic i64 1099511627775, ptr %ptr seq_cst, align 8
+ ret void
+}
+
+define void @f44(ptr %ptr) {
+; CHECK-LABEL: f44:
+; CHECK: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI48_0
+; CHECK-NEXT: ld %f0, 0(%r1)
+; CHECK-NEXT: std %f0, 0(%r2)
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ store atomic double 0x3ff0000020000000, ptr %ptr seq_cst, align 8
+ ret void
+}
+
+; Vector Store Element.
+define void @f45(<16 x i8> %val, ptr %ptr) {
+; CHECK-LABEL: f45:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsteb %v24, 0(%r2), 0
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ %element = extractelement <16 x i8> %val, i32 0
+ store atomic i8 %element, ptr %ptr seq_cst, align 1
+ ret void
+}
+
+define void @f46(<8 x i16> %val, ptr %base) {
+; CHECK-LABEL: f46:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsteh %v24, 4094(%r2), 5
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ %ptr = getelementptr i16, ptr %base, i32 2047
+ %element = extractelement <8 x i16> %val, i32 5
+ store atomic i16 %element, ptr %ptr seq_cst, align 2
+ ret void
+}
+
+define void @f47(<4 x i32> %val, ptr %ptr) {
+; CHECK-LABEL: f47:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vstef %v24, 0(%r2), 3
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ %element = extractelement <4 x i32> %val, i32 3
+ store atomic i32 %element, ptr %ptr seq_cst, align 4
+ ret void
+}
+
+define void @f48(<2 x i64> %val, ptr %ptr) {
+; CHECK-LABEL: f48:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsteg %v24, 0(%r2), 1
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ %element = extractelement <2 x i64> %val, i32 1
+ store atomic i64 %element, ptr %ptr seq_cst, align 8
+ ret void
+}
+
+define void @f49(<4 x float> %val, ptr %ptr) {
+; CHECK-LABEL: f49:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vstef %v24, 0(%r2), 0
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ %element = extractelement <4 x float> %val, i32 0
+ store atomic float %element, ptr %ptr seq_cst, align 4
+ ret void
+}
+
+define void @f50(<2 x double> %val, ptr %ptr) {
+; CHECK-LABEL: f50:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsteg %v24, 0(%r2), 1
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ %element = extractelement <2 x double> %val, i32 1
+ store atomic double %element, ptr %ptr seq_cst, align 8
+ ret void
+}
+
+define void @f51(ptr %src, ptr %dst) {
+; CHECK-LABEL: f51:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpq %r0, 0(%r2)
+; CHECK-NEXT: vlvgp %v0, %r0, %r1
+; CHECK-NEXT: vgmf %v1, 2, 8
+; CHECK-NEXT: aebr %f0, %f1
+; CHECK-NEXT: ste %f0, 0(%r3)
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ %atomic-load = load atomic i128, ptr %src seq_cst, align 16
+ %b0 = bitcast i128 %atomic-load to <4 x float>
+ %vecext = extractelement <4 x float> %b0, i64 0
+ %add = fadd float %vecext, 1.000000e+00
+ %b1 = bitcast float %add to i32
+ store atomic i32 %b1, ptr %dst seq_cst, align 4
+ ret void
+}
+
+define void @f52(ptr %src, ptr %dst) {
+; CHECK-LABEL: f52:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpq %r0, 0(%r2)
+; CHECK-NEXT: vlvgp %v0, %r0, %r1
+; CHECK-NEXT: vgmg %v1, 2, 11
+; CHECK-NEXT: adbr %f0, %f1
+; CHECK-NEXT: std %f0, 0(%r3)
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ %atomic-load = load atomic i128, ptr %src seq_cst, align 16
+ %b0 = bitcast i128 %atomic-load to <2 x double>
+ %vecext = extractelement <2 x double> %b0, i64 0
+ %add = fadd double %vecext, 1.000000e+00
+ %b1 = bitcast double %add to i64
+ store atomic i64 %b1, ptr %dst seq_cst, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-06.ll b/llvm/test/CodeGen/SystemZ/atomic-store-06.ll
index b748bfc767a4db..91e324b0af1a97 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-06.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-06.ll
@@ -6,10 +6,7 @@
define void @f1(ptr %src, float %val) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $f0s killed $f0s def $f0d
-; CHECK-NEXT: lgdr %r0, %f0
-; CHECK-NEXT: srlg %r0, %r0, 32
-; CHECK-NEXT: st %r0, 0(%r2)
+; CHECK-NEXT: ste %f0, 0(%r2)
; CHECK-NEXT: bcr 15, %r0
; CHECK-NEXT: br %r14
store atomic float %val, ptr %src seq_cst, align 4
From 4d0e6f740720ed5e0d550a0b59af34d2894f562a Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Tue, 9 Jan 2024 13:40:12 -0600
Subject: [PATCH 2/3] Updates after review
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 13 ++++--
.../Target/SystemZ/SystemZISelDAGToDAG.cpp | 7 +---
.../Target/SystemZ/SystemZISelLowering.cpp | 15 ++++---
llvm/lib/Target/SystemZ/SystemZISelLowering.h | 4 +-
llvm/lib/Target/SystemZ/SystemZInstrFP.td | 8 ++--
llvm/lib/Target/SystemZ/SystemZOperators.td | 4 --
llvm/test/CodeGen/SystemZ/atomic-memofolds.ll | 42 +++++++++++++++++--
7 files changed, 63 insertions(+), 30 deletions(-)
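As a sketch of what relaxing storeLoadIsAligned for atomic MMOs enables (mirroring the updated f15 test below), a reg/mem FP operation such as maeb can now fold the atomic load directly:

define float @f15(float %f1, ptr %ptr, float %acc) {
  %f2 = load atomic float, ptr %ptr seq_cst, align 4
  %res = call float @llvm.fma.f32(float %f1, float %f2, float %acc)
  ret float %res
}

With this update the checks expect maeb %f2, %f0, 0(%r2) instead of a separate lde + wfmasb.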
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 7642b6f4160a7b..698ec9c584c1d5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -343,12 +343,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) {
if (N->getOpcode() == ISD::ATOMIC_LOAD) {
ISD::LoadExtType ETy = cast<AtomicSDNode>(N)->getExtensionType();
if (ETy == ISD::NON_EXTLOAD) {
- if (TLI.getExtendForAtomicOps() == ISD::SIGN_EXTEND)
+ switch (TLI.getExtendForAtomicOps()) {
+ case ISD::SIGN_EXTEND:
ETy = ISD::SEXTLOAD;
- else if (TLI.getExtendForAtomicOps() == ISD::ZERO_EXTEND)
+ break;
+ case ISD::ZERO_EXTEND:
ETy = ISD::ZEXTLOAD;
- else
+ break;
+ case ISD::ANY_EXTEND:
ETy = ISD::EXTLOAD;
+ break;
+ default:
+ llvm_unreachable("Invalid atomic op extension");
+ }
}
cast<AtomicSDNode>(Res)->setExtensionType(ETy);
}
diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index f4e6081ae82104..48b2999096fcd4 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -1516,13 +1516,10 @@ bool SystemZDAGToDAGISel::storeLoadIsAligned(SDNode *N) const {
MachineMemOperand *MMO = MemAccess->getMemOperand();
assert(MMO && "Expected a memory operand.");
- // These instructions are not atomic.
- if (MMO->isAtomic())
- return false;
-
// The memory access must have a proper alignment and no index register.
+  // ATOMIC_LOADs do not have an offset operand.
if (MemAccess->getAlign().value() < StoreSize ||
- !MemAccess->getOffset().isUndef())
+ (!MMO->isAtomic() && !MemAccess->getOffset().isUndef()))
return false;
// The MMO must not have an unaligned offset.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 39bd1aea6979c4..c8411ca577f11c 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -4515,7 +4515,7 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
return DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
}
-SDValue SystemZTargetLowering::lowerATOMIC_I128_LDST(SDValue Op,
+SDValue SystemZTargetLowering::lowerATOMIC_LDST_I128(SDValue Op,
SelectionDAG &DAG) const {
auto *Node = cast<AtomicSDNode>(Op.getNode());
assert(Node->getMemoryVT() == MVT::i128 && "Only custom lowering i128.");
@@ -5645,12 +5645,11 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
return GS.getNode(DAG, SDLoc(BVN));
}
-bool SystemZTargetLowering::isVectorElementLoad(SDValue Op, EVT VecVT) const {
+bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const {
if (Op.getOpcode() == ISD::LOAD && cast<LoadSDNode>(Op)->isUnindexed())
return true;
if (auto *AL = dyn_cast<AtomicSDNode>(Op))
- if (AL->getOpcode() == ISD::ATOMIC_LOAD && SDValue(AL, 0).hasOneUse() &&
- AL->getMemoryVT() == VecVT.getScalarType())
+ if (AL->getOpcode() == ISD::ATOMIC_LOAD)
return true;
if (Subtarget.hasVectorEnhancements2() && Op.getOpcode() == SystemZISD::LRV)
return true;
@@ -5689,13 +5688,13 @@ SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
// we would need 2 instructions to replicate it: VLVGP followed by VREPx.
// This is only a win if the single defined element is used more than once.
// In other cases we're better off using a single VLVGx.
- if (Single.getNode() && (Count > 1 || isVectorElementLoad(Single, VT)))
+ if (Single.getNode() && (Count > 1 || isVectorElementLoad(Single)))
return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single);
// If all elements are loads, use VLREP/VLEs (below).
bool AllLoads = true;
for (auto Elem : Elems)
- if (!isVectorElementLoad(Elem, VT)) {
+ if (!isVectorElementLoad(Elem)) {
AllLoads = false;
break;
}
@@ -5767,7 +5766,7 @@ SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
std::map<const SDNode*, unsigned> UseCounts;
SDNode *LoadMaxUses = nullptr;
for (unsigned I = 0; I < NumElements; ++I)
- if (isVectorElementLoad(Elems[I], VT)) {
+ if (isVectorElementLoad(Elems[I])) {
SDNode *Ld = Elems[I].getNode();
UseCounts[Ld]++;
if (LoadMaxUses == nullptr || UseCounts[LoadMaxUses] < UseCounts[Ld])
@@ -6129,7 +6128,7 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
case ISD::ATOMIC_STORE:
case ISD::ATOMIC_LOAD:
- return lowerATOMIC_I128_LDST(Op, DAG);
+ return lowerATOMIC_LDST_I128(Op, DAG);
case ISD::ATOMIC_LOAD_ADD:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
case ISD::ATOMIC_LOAD_SUB:
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 9c442268dbb111..e1ea069f30ba86 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -694,7 +694,7 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerATOMIC_I128_LDST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_LDST_I128(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
unsigned Opcode) const;
SDValue lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
@@ -704,7 +704,7 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
- bool isVectorElementLoad(SDValue Op, EVT VecVT) const;
+ bool isVectorElementLoad(SDValue Op) const;
SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
SmallVectorImpl<SDValue> &Elems) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 8cafa0a936a404..6e67425c1e788b 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -495,8 +495,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MAEBR : TernaryRRD<"maebr", 0xB30E, z_any_fma, FP32, FP32>;
def MADBR : TernaryRRD<"madbr", 0xB31E, z_any_fma, FP64, FP64>;
- defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, nonatomic_ld, 4>;
- defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, nonatomic_ld, 8>;
+ defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, load, 4>;
+ defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, load, 8>;
}
// Fused multiply-subtract.
@@ -504,8 +504,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MSEBR : TernaryRRD<"msebr", 0xB30F, z_any_fms, FP32, FP32>;
def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_any_fms, FP64, FP64>;
- defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, nonatomic_ld, 4>;
- defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, nonatomic_ld, 8>;
+ defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, load, 4>;
+ defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, load, 8>;
}
// Division.
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index 28815083daab0c..d98bb886c18506 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -607,10 +607,6 @@ def nonvolatile_anyextloadi8 : NonvolatileLoad<anyextloadi8>;
def nonvolatile_anyextloadi16 : NonvolatileLoad<anyextloadi16>;
def nonvolatile_anyextloadi32 : NonvolatileLoad<anyextloadi32>;
-def nonatomic_ld : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return !cast<LoadSDNode>(N)->isAtomic();
-}]>;
-
// Non-volatile stores.
class NonvolatileStore<SDPatternOperator store>
: PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
diff --git a/llvm/test/CodeGen/SystemZ/atomic-memofolds.ll b/llvm/test/CodeGen/SystemZ/atomic-memofolds.ll
index 56c1eb2b85a8d5..fa1578df04bec1 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-memofolds.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-memofolds.ll
@@ -170,12 +170,11 @@ define i64 @f14(i64 %a, ptr %src) {
ret i64 %sub
}
-; Check that maeb (reg/mem) is *not* used for an atomic load.
define float @f15(float %f1, ptr %ptr, float %acc) {
; CHECK-LABEL: f15:
; CHECK: # %bb.0:
-; CHECK-NEXT: lde %f1, 0(%r2)
-; CHECK-NEXT: wfmasb %f0, %f0, %f1, %f2
+; CHECK-NEXT: maeb %f2, %f0, 0(%r2)
+; CHECK-NEXT: ldr %f0, %f2
; CHECK-NEXT: br %r14
%f2 = load atomic float, ptr %ptr seq_cst, align 4
%res = call float @llvm.fma.f32 (float %f1, float %f2, float %acc)
@@ -387,6 +386,39 @@ define void @f25_b(ptr %src, ptr %dst) {
ret void
}
+; Do *not* use vlrep for an extending load.
+define <4 x i32> @f25_c(ptr %ptr) {
+; CHECK-LABEL: f25_c:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lb %r0, 0(%r2)
+; CHECK-NEXT: vlvgp %v0, %r0, %r0
+; CHECK-NEXT: vrepf %v24, %v0, 1
+; CHECK-NEXT: br %r14
+ %L = load atomic i8, ptr %ptr seq_cst, align 4
+ %S = sext i8 %L to i32
+ %val = insertelement <4 x i32> undef, i32 %S, i32 0
+ %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+ <4 x i32> zeroinitializer
+ ret <4 x i32> %ret
+}
+
+; Do *not* use vlrep if there is another scalar use.
+define <4 x i32> @f25_d(ptr %ptr, ptr %dst) {
+; CHECK-LABEL: f25_d:
+; CHECK: # %bb.0:
+; CHECK-NEXT: l %r0, 0(%r2)
+; CHECK-NEXT: vlvgp %v0, %r0, %r0
+; CHECK-NEXT: vrepf %v24, %v0, 1
+; CHECK-NEXT: st %r0, 0(%r3)
+; CHECK-NEXT: br %r14
+ %L = load atomic i32, ptr %ptr seq_cst, align 4
+ store i32 %L, ptr %dst, align 4
+ %val = insertelement <4 x i32> undef, i32 %L, i32 0
+ %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+ <4 x i32> zeroinitializer
+ ret <4 x i32> %ret
+}
+
define void @f26(ptr %src, ptr %dst) {
; CHECK-LABEL: f26:
; CHECK: # %bb.0:
@@ -412,6 +444,8 @@ define void @f26_b(ptr %src, ptr %dst) {
ret void
}
+
+
; Vector Load logical element and zero.
define <16 x i8> @f27(ptr %ptr) {
; CHECK-LABEL: f27:
@@ -607,7 +641,7 @@ define void @f43(ptr %ptr) {
define void @f44(ptr %ptr) {
; CHECK-LABEL: f44:
; CHECK: # %bb.0:
-; CHECK-NEXT: larl %r1, .LCPI48_0
+; CHECK-NEXT: larl %r1, .LCPI50_0
; CHECK-NEXT: ld %f0, 0(%r1)
; CHECK-NEXT: std %f0, 0(%r2)
; CHECK-NEXT: bcr 14, %r0
From 30a96e3e4188bdbf5c4f4e371670d90c5c751c3f Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1 at linux.ibm.com>
Date: Tue, 6 Feb 2024 12:54:57 +0100
Subject: [PATCH 3/3] Try selecting atomic loads with patfrags again.
---
.../include/llvm/Target/TargetSelectionDAG.td | 2 +-
llvm/lib/Target/PowerPC/PPCInstrInfo.td | 12 +-
llvm/lib/Target/PowerPC/PPCInstrP10.td | 16 +-
.../Target/SystemZ/SystemZISelDAGToDAG.cpp | 68 ++---
.../Target/SystemZ/SystemZISelLowering.cpp | 4 +-
llvm/lib/Target/SystemZ/SystemZInstrFP.td | 46 ++--
.../lib/Target/SystemZ/SystemZInstrFormats.td | 4 +-
llvm/lib/Target/SystemZ/SystemZInstrHFP.td | 54 ++--
llvm/lib/Target/SystemZ/SystemZInstrInfo.td | 250 +++++++++---------
llvm/lib/Target/SystemZ/SystemZInstrVector.td | 40 +--
llvm/lib/Target/SystemZ/SystemZOperators.td | 194 ++++++++++----
llvm/lib/Target/SystemZ/SystemZPatterns.td | 4 +-
llvm/lib/Target/VE/VEInstrInfo.td | 8 +-
llvm/test/CodeGen/SystemZ/atomic-load-06.ll | 2 +-
.../CodeGen/SystemZ/atomic-memops-fp128.ll | 31 +++
.../{atomic-memofolds.ll => atomic-memops.ll} | 52 +++-
16 files changed, 477 insertions(+), 310 deletions(-)
create mode 100644 llvm/test/CodeGen/SystemZ/atomic-memops-fp128.ll
rename llvm/test/CodeGen/SystemZ/{atomic-memofolds.ll => atomic-memops.ll} (94%)
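A minimal sketch of what dropping SDTCisInt<0> from SDTAtomicLoad is meant to allow (the function name here is made up for illustration): an FP-typed atomic load that the ordinary load PatFrags can match directly, which on SystemZ should now select a plain FP load with an atomic MMO, while targets such as PowerPC keep their atomic load patterns integer-only via the added iAny constraints.

define double @load_atomic_f64(ptr %src) {
  %val = load atomic double, ptr %src seq_cst, align 8
  ret double %val
}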
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 22360353790dbc..0b156ab6c64983 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -318,7 +318,7 @@ def SDTAtomicStore : SDTypeProfile<0, 2, [
SDTCisInt<0>, SDTCisPtrTy<1>
]>;
def SDTAtomicLoad : SDTypeProfile<1, 1, [
- SDTCisInt<0>, SDTCisPtrTy<1>
+ SDTCisPtrTy<1>
]>;
class SDCallSeqStart<list<SDTypeConstraint> constraints> :
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 3abd97f2c38c09..70c2fc74818717 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -5035,12 +5035,12 @@ defm : TrapExtendedMnemonic<"lng", 6>;
defm : TrapExtendedMnemonic<"u", 31>;
// Atomic loads
-def : Pat<(atomic_load_8 DForm:$src), (LBZ memri:$src)>;
-def : Pat<(atomic_load_16 DForm:$src), (LHZ memri:$src)>;
-def : Pat<(atomic_load_32 DForm:$src), (LWZ memri:$src)>;
-def : Pat<(atomic_load_8 XForm:$src), (LBZX memrr:$src)>;
-def : Pat<(atomic_load_16 XForm:$src), (LHZX memrr:$src)>;
-def : Pat<(atomic_load_32 XForm:$src), (LWZX memrr:$src)>;
+def : Pat<(iAny (atomic_load_8 DForm:$src)), (LBZ memri:$src)>;
+def : Pat<(iAny (atomic_load_16 DForm:$src)), (LHZ memri:$src)>;
+def : Pat<(iAny (atomic_load_32 DForm:$src)), (LWZ memri:$src)>;
+def : Pat<(iAny (atomic_load_8 XForm:$src)), (LBZX memrr:$src)>;
+def : Pat<(iAny (atomic_load_16 XForm:$src)), (LHZX memrr:$src)>;
+def : Pat<(iAny (atomic_load_32 XForm:$src)), (LWZX memrr:$src)>;
// Atomic stores
def : Pat<(atomic_store_8 i32:$val, DForm:$ptr), (STB gprc:$val, memri:$ptr)>;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index d5a372e4dc1010..41068f86935f8a 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -1289,13 +1289,13 @@ let Predicates = [PCRelativeMemops] in {
(PSTXVpc $XS, $ga, 0)>;
// Atomic Load
- def : Pat<(atomic_load_8 (PPCmatpcreladdr PCRelForm:$ga)),
+ def : Pat<(iAny (atomic_load_8 (PPCmatpcreladdr PCRelForm:$ga))),
(PLBZpc $ga, 0)>;
- def : Pat<(atomic_load_16 (PPCmatpcreladdr PCRelForm:$ga)),
+ def : Pat<(iAny (atomic_load_16 (PPCmatpcreladdr PCRelForm:$ga))),
(PLHZpc $ga, 0)>;
- def : Pat<(atomic_load_32 (PPCmatpcreladdr PCRelForm:$ga)),
+ def : Pat<(iAny (atomic_load_32 (PPCmatpcreladdr PCRelForm:$ga))),
(PLWZpc $ga, 0)>;
- def : Pat<(atomic_load_64 (PPCmatpcreladdr PCRelForm:$ga)),
+ def : Pat<(iAny (atomic_load_64 (PPCmatpcreladdr PCRelForm:$ga))),
(PLDpc $ga, 0)>;
// Atomic Store
@@ -2347,10 +2347,10 @@ let Predicates = [PrefixInstrs] in {
def : Pat<(store f64:$FRS, PDForm:$dst), (PSTFD $FRS, memri34:$dst)>;
// Atomic Load
- def : Pat<(atomic_load_8 PDForm:$src), (PLBZ memri34:$src)>;
- def : Pat<(atomic_load_16 PDForm:$src), (PLHZ memri34:$src)>;
- def : Pat<(atomic_load_32 PDForm:$src), (PLWZ memri34:$src)>;
- def : Pat<(atomic_load_64 PDForm:$src), (PLD memri34:$src)>;
+ def : Pat<(iAny (atomic_load_8 PDForm:$src)), (PLBZ memri34:$src)>;
+ def : Pat<(iAny (atomic_load_16 PDForm:$src)), (PLHZ memri34:$src)>;
+ def : Pat<(iAny (atomic_load_32 PDForm:$src)), (PLWZ memri34:$src)>;
+ def : Pat<(iAny (atomic_load_64 PDForm:$src)), (PLD memri34:$src)>;
// Atomic Store
def : Pat<(atomic_store_8 i32:$RS, PDForm:$dst), (PSTB $RS, memri34:$dst)>;
diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 48b2999096fcd4..cc1cbb23ccde4b 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -344,12 +344,12 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
// requirements for a PC-relative access.
bool storeLoadIsAligned(SDNode *N) const;
+ // Return the load extension type of a load or atomic load.
+ ISD::LoadExtType getLoadExtType(SDNode *N) const;
+
// Try to expand a boolean SELECT_CCMASK using an IPM sequence.
SDValue expandSelectBoolean(SDNode *Node);
- // Convert ATOMIC_LOADs to LOADs to facilitate instruction selection.
- void convertATOMIC_LOADs(SDNode *Node, unsigned Depth = 0);
-
public:
static char ID;
@@ -1510,16 +1510,17 @@ bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
bool SystemZDAGToDAGISel::storeLoadIsAligned(SDNode *N) const {
- auto *MemAccess = cast<LSBaseSDNode>(N);
+ auto *MemAccess = cast<MemSDNode>(N);
+ auto *LdSt = dyn_cast<LSBaseSDNode>(MemAccess);
TypeSize StoreSize = MemAccess->getMemoryVT().getStoreSize();
SDValue BasePtr = MemAccess->getBasePtr();
MachineMemOperand *MMO = MemAccess->getMemOperand();
assert(MMO && "Expected a memory operand.");
// The memory access must have a proper alignment and no index register.
- // ATOMIC_LOADs do not have the offset operand.
+ // Only load and store nodes have the offset operand (atomic loads do not).
if (MemAccess->getAlign().value() < StoreSize ||
- (!MMO->isAtomic() && !MemAccess->getOffset().isUndef()))
+ (LdSt && !LdSt->getOffset().isUndef()))
return false;
// The MMO must not have an unaligned offset.
@@ -1549,35 +1550,15 @@ bool SystemZDAGToDAGISel::storeLoadIsAligned(SDNode *N) const {
return true;
}
-// This is a hack to convert ATOMIC_LOADs to LOADs in the last minute just
-// before instruction selection begins. It would have been easier if
-// ATOMIC_LOAD nodes would instead always be built by SelectionDAGBuilder as
-// LOADs with an atomic MMO and properly handled as such in DAGCombiner, but
-// until that changes they need to remain as ATOMIC_LOADs until all
-// DAGCombining is done. Convert Node or any of its operands from
-// ATOMIC_LOAD to LOAD.
-void SystemZDAGToDAGISel::convertATOMIC_LOADs(SDNode *Node, unsigned Depth) {
- if (Depth > 1) // Chain operands are also followed so this seems enough.
- return;
- if (Node->getOpcode() == ISD::ATOMIC_LOAD) {
- auto *ALoad = cast<AtomicSDNode>(Node);
- // It seems necessary to morph the node as it is not yet being selected.
- LoadSDNode *Ld = cast<LoadSDNode>(CurDAG->MorphNodeTo(
- ALoad, ISD::LOAD, CurDAG->getVTList(ALoad->getValueType(0), MVT::Other),
- {ALoad->getChain(), ALoad->getBasePtr()}));
- // Sanity check the morph. The extension type for an extending load
- // should have been set prior to instruction selection and remain in the
- // morphed node.
- assert(((SDNode *)Ld) == ((SDNode *)ALoad) && "Bad CSE on atomic load.");
- assert(Ld->getMemOperand()->isAtomic() && "Broken MMO.");
- ISD::LoadExtType ETy = Ld->getExtensionType();
- bool IsNonExt = Ld->getMemoryVT().getSizeInBits() ==
- Ld->getValueType(0).getSizeInBits();
- assert(IsNonExt == (ETy == ISD::NON_EXTLOAD) && "Bad extension type.");
- return;
- }
- for (SDValue Op : Node->ops())
- convertATOMIC_LOADs(Op.getNode(), ++Depth);
+ISD::LoadExtType SystemZDAGToDAGISel::getLoadExtType(SDNode *N) const {
+ ISD::LoadExtType ETy;
+ if (auto *L = dyn_cast<LoadSDNode>(N))
+ ETy = L->getExtensionType();
+ else if (auto *AL = dyn_cast<AtomicSDNode>(N))
+ ETy = AL->getExtensionType();
+ else
+ llvm_unreachable("Unknown load node type.");
+ return ETy;
}
void SystemZDAGToDAGISel::Select(SDNode *Node) {
@@ -1588,9 +1569,6 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
return;
}
- // Prepare any ATOMIC_LOAD to be selected as a LOAD with an atomic MMO.
- convertATOMIC_LOADs(Node);
-
unsigned Opcode = Node->getOpcode();
switch (Opcode) {
case ISD::OR:
@@ -1783,7 +1761,8 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
case ISD::ATOMIC_STORE: {
auto *AtomOp = cast<AtomicSDNode>(Node);
- // Store FP values directly without first moving to a GPR.
+ // Store FP values directly without first moving to a GPR. This is needed
+ // as long as clang always emits the cast to integer.
EVT SVT = AtomOp->getMemoryVT();
SDValue StoredVal = AtomOp->getVal();
if (SVT.isInteger() && StoredVal->getOpcode() == ISD::BITCAST &&
@@ -1791,6 +1770,9 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
StoredVal = StoredVal->getOperand(0);
SVT = StoredVal.getValueType();
}
+ // Replace the atomic_store with a regular store and select it. This is
+ // ok since we know all store instructions <= 8 bytes are atomic, and the
+ // 16 byte case is already handled during lowering.
StoreSDNode *St = cast<StoreSDNode>(CurDAG->getTruncStore(
AtomOp->getChain(), SDLoc(AtomOp), StoredVal, AtomOp->getBasePtr(), SVT,
AtomOp->getMemOperand()));
@@ -1807,6 +1789,14 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
}
}
+#ifndef NDEBUG
+ if (auto *AL = dyn_cast<AtomicSDNode>(Node))
+ if (AL->getOpcode() == ISD::ATOMIC_LOAD)
+ assert((AL->getExtensionType() == ISD::NON_EXTLOAD ||
+ AL->getMemoryVT().isScalarInteger()) &&
+ "Not expecting extending fp atomic_load nodes.");
+#endif
+
SelectCode(Node);
}
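
[Editorial note, not part of the patch] To illustrate the ATOMIC_STORE handling above: clang lowers an atomic FP store by first bitcasting the value to an integer, and the Select() change strips that cast so the store can be selected directly as an FP store. A minimal IR sketch (function name hypothetical, expected output an untested guess):

define void @store_atomic_double(ptr %dst, double %val) {
  %cast = bitcast double %val to i64
  store atomic i64 %cast, ptr %dst seq_cst, align 8
  ret void
}
; expected to select something like:  std %f0, 0(%r2)  followed by  bcr 14, %r0
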
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index c8411ca577f11c..75fd61461ecb15 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -916,6 +916,7 @@ bool SystemZTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const
return false;
}
+// FIXME: Clang always emits these casts regardless of these hooks.
TargetLowering::AtomicExpansionKind
SystemZTargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
// Lower fp128 the same way as i128.
@@ -6604,7 +6605,8 @@ SDValue SystemZTargetLowering::combineBITCAST(SDNode *N,
EVT ResVT = N->getValueType(0);
// Handle atomic loads to load float/double values directly and not via a
// GPR. Do it before legalization to help in treating the ATOMIC_LOAD the
- // same way as a LOAD, and e.g. emit a REPLICATE.
+ // same way as a LOAD, and e.g. emit a REPLICATE. FIXME: This is only
+ // needed because clang currently always emits these casts.
if (auto *ALoad = dyn_cast<AtomicSDNode>(N0))
if (ALoad->getOpcode() == ISD::ATOMIC_LOAD && InVT.getSizeInBits() <= 64 &&
ALoad->getExtensionType() == ISD::NON_EXTLOAD &&
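
[Editorial note, not part of the patch] Conversely, for loads the combineBITCAST change above lets an integer atomic load whose result is only used via a bitcast be treated as an FP ATOMIC_LOAD, so it is selected without a GPR round trip. A minimal IR sketch (function name hypothetical, expected output an untested guess):

define double @load_atomic_double(ptr %src) {
  %i = load atomic i64, ptr %src seq_cst, align 8
  %f = bitcast i64 %i to double
  ret double %f
}
; expected to select a plain:  ld %f0, 0(%r2)
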
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 6e67425c1e788b..f4b5aeaebef923 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -129,8 +129,8 @@ defm LoadStoreF128 : MVCLoadStore<load, f128, MVCImm, 15>;
//===----------------------------------------------------------------------===//
let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in {
- defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>;
- defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;
+ defm LE : UnaryRXPair<"le", 0x78, 0xED64, z_load, FP32, 4>;
+ defm LD : UnaryRXPair<"ld", 0x68, 0xED65, z_load, FP64, 8>;
// For z13 we prefer LDE over LE to avoid partial register dependencies.
let isCodeGenOnly = 1 in
@@ -200,14 +200,14 @@ let Predicates = [FeatureNoVectorEnhancements1] in {
// Extend memory floating-point values to wider representations.
let Uses = [FPC], mayRaiseFPException = 1 in {
- def LDEB : UnaryRXE<"ldeb", 0xED04, any_extloadf32, FP64, 4>;
+ def LDEB : UnaryRXE<"ldeb", 0xED04, z_any_extloadf32, FP64, 4>;
def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>;
def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>;
}
let Predicates = [FeatureNoVectorEnhancements1] in {
- def : Pat<(f128 (any_extloadf32 bdxaddr12only:$src)),
+ def : Pat<(f128 (z_any_extloadf32 bdxaddr12only:$src)),
(LXEB bdxaddr12only:$src)>;
- def : Pat<(f128 (any_extloadf64 bdxaddr12only:$src)),
+ def : Pat<(f128 (z_any_extloadf64 bdxaddr12only:$src)),
(LXDB bdxaddr12only:$src)>;
}
@@ -430,8 +430,8 @@ let Uses = [FPC], mayRaiseFPException = 1,
def ADBR : BinaryRRE<"adbr", 0xB31A, any_fadd, FP64, FP64>;
def AXBR : BinaryRRE<"axbr", 0xB34A, any_fadd, FP128, FP128>;
}
- defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, load, 4>;
- defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, load, 8>;
+ defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, z_load, 4>;
+ defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, z_load, 8>;
}
// Subtraction.
@@ -441,8 +441,8 @@ let Uses = [FPC], mayRaiseFPException = 1,
def SDBR : BinaryRRE<"sdbr", 0xB31B, any_fsub, FP64, FP64>;
def SXBR : BinaryRRE<"sxbr", 0xB34B, any_fsub, FP128, FP128>;
- defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, any_fsub, FP32, load, 4>;
- defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, any_fsub, FP64, load, 8>;
+ defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, any_fsub, FP32, z_load, 4>;
+ defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, any_fsub, FP64, z_load, 8>;
}
// Multiplication.
@@ -452,8 +452,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MDBR : BinaryRRE<"mdbr", 0xB31C, any_fmul, FP64, FP64>;
def MXBR : BinaryRRE<"mxbr", 0xB34C, any_fmul, FP128, FP128>;
}
- defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, load, 4>;
- defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, any_fmul, FP64, load, 8>;
+ defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, z_load, 4>;
+ defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, any_fmul, FP64, z_load, 8>;
}
// f64 multiplication of two FP32 registers.
@@ -466,7 +466,7 @@ def : Pat<(any_fmul (f64 (any_fpextend FP32:$src1)),
// f64 multiplication of an FP32 register and an f32 memory.
let Uses = [FPC], mayRaiseFPException = 1 in
- def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
+ def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, z_load, 4>;
def : Pat<(any_fmul (f64 (any_fpextend FP32:$src1)),
(f64 (any_extloadf32 bdxaddr12only:$addr))),
(MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_h32),
@@ -483,7 +483,7 @@ let Predicates = [FeatureNoVectorEnhancements1] in
// f128 multiplication of an FP64 register and an f64 memory.
let Uses = [FPC], mayRaiseFPException = 1 in
- def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
+ def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, z_load, 8>;
let Predicates = [FeatureNoVectorEnhancements1] in
def : Pat<(any_fmul (f128 (any_fpextend FP64:$src1)),
(f128 (any_extloadf64 bdxaddr12only:$addr))),
@@ -495,8 +495,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MAEBR : TernaryRRD<"maebr", 0xB30E, z_any_fma, FP32, FP32>;
def MADBR : TernaryRRD<"madbr", 0xB31E, z_any_fma, FP64, FP64>;
- defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, load, 4>;
- defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, load, 8>;
+ defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, z_load, 4>;
+ defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, z_load, 8>;
}
// Fused multiply-subtract.
@@ -504,8 +504,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MSEBR : TernaryRRD<"msebr", 0xB30F, z_any_fms, FP32, FP32>;
def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_any_fms, FP64, FP64>;
- defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, load, 4>;
- defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, load, 8>;
+ defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, z_load, 4>;
+ defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, z_load, 8>;
}
// Division.
@@ -514,8 +514,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def DDBR : BinaryRRE<"ddbr", 0xB31D, any_fdiv, FP64, FP64>;
def DXBR : BinaryRRE<"dxbr", 0xB34D, any_fdiv, FP128, FP128>;
- defm DEB : BinaryRXEAndPseudo<"deb", 0xED0D, any_fdiv, FP32, load, 4>;
- defm DDB : BinaryRXEAndPseudo<"ddb", 0xED1D, any_fdiv, FP64, load, 8>;
+ defm DEB : BinaryRXEAndPseudo<"deb", 0xED0D, any_fdiv, FP32, z_load, 4>;
+ defm DDB : BinaryRXEAndPseudo<"ddb", 0xED1D, any_fdiv, FP64, z_load, 8>;
}
// Divide to integer.
@@ -533,15 +533,15 @@ let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC], CCValues = 0xF in {
def CDBR : CompareRRE<"cdbr", 0xB319, z_any_fcmp, FP64, FP64>;
def CXBR : CompareRRE<"cxbr", 0xB349, z_any_fcmp, FP128, FP128>;
- def CEB : CompareRXE<"ceb", 0xED09, z_any_fcmp, FP32, load, 4>;
- def CDB : CompareRXE<"cdb", 0xED19, z_any_fcmp, FP64, load, 8>;
+ def CEB : CompareRXE<"ceb", 0xED09, z_any_fcmp, FP32, z_load, 4>;
+ def CDB : CompareRXE<"cdb", 0xED19, z_any_fcmp, FP64, z_load, 8>;
def KEBR : CompareRRE<"kebr", 0xB308, z_strict_fcmps, FP32, FP32>;
def KDBR : CompareRRE<"kdbr", 0xB318, z_strict_fcmps, FP64, FP64>;
def KXBR : CompareRRE<"kxbr", 0xB348, z_strict_fcmps, FP128, FP128>;
- def KEB : CompareRXE<"keb", 0xED08, z_strict_fcmps, FP32, load, 4>;
- def KDB : CompareRXE<"kdb", 0xED18, z_strict_fcmps, FP64, load, 8>;
+ def KEB : CompareRXE<"keb", 0xED08, z_strict_fcmps, FP32, z_load, 4>;
+ def KDB : CompareRXE<"kdb", 0xED18, z_strict_fcmps, FP64, z_load, 8>;
}
// Test Data Class.
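
[Editorial note, not part of the patch] The z_load substitutions in SystemZInstrFP.td above allow a non-extending atomic load to be folded into the reg/mem FP instructions, just like the maeb case in test f15. A minimal IR sketch (function name hypothetical, expected output an untested guess, assuming the adb pattern fires):

define double @fadd_atomic(double %acc, ptr %src) {
  %l = load atomic double, ptr %src seq_cst, align 8
  %sum = fadd double %acc, %l
  ret double %sum
}
; expected to fold the load:  adb %f0, 0(%r2)
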
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index bb9fa0fc33ffa0..3dba33b66bf4f4 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -3777,7 +3777,7 @@ class BinarySI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
Operand imm, AddressingMode mode = bdaddr12only>
: InstSI<opcode, (outs), (ins (mode $B1, $D1):$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
- [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
+ [(store (operator (z_load mode:$BD1), imm:$I2), mode:$BD1)]> {
let mayLoad = 1;
let mayStore = 1;
}
@@ -3786,7 +3786,7 @@ class BinarySIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
Operand imm, AddressingMode mode = bdaddr20only>
: InstSIY<opcode, (outs), (ins (mode $B1, $D1):$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
- [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
+ [(store (operator (z_load mode:$BD1), imm:$I2), mode:$BD1)]> {
let mayLoad = 1;
let mayStore = 1;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrHFP.td b/llvm/lib/Target/SystemZ/SystemZInstrHFP.td
index 2e3c9932d62142..d2e05b63c6c630 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrHFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrHFP.td
@@ -134,8 +134,8 @@ let Defs = [CC] in {
def ADR : BinaryRR<"adr", 0x2A, null_frag, FP64, FP64>;
def AXR : BinaryRR<"axr", 0x36, null_frag, FP128, FP128>;
}
- def AE : BinaryRX<"ae", 0x7A, null_frag, FP32, load, 4>;
- def AD : BinaryRX<"ad", 0x6A, null_frag, FP64, load, 8>;
+ def AE : BinaryRX<"ae", 0x7A, null_frag, FP32, z_load, 4>;
+ def AD : BinaryRX<"ad", 0x6A, null_frag, FP64, z_load, 8>;
}
// Addition (unnormalized).
@@ -144,8 +144,8 @@ let Defs = [CC] in {
def AUR : BinaryRR<"aur", 0x3E, null_frag, FP32, FP32>;
def AWR : BinaryRR<"awr", 0x2E, null_frag, FP64, FP64>;
}
- def AU : BinaryRX<"au", 0x7E, null_frag, FP32, load, 4>;
- def AW : BinaryRX<"aw", 0x6E, null_frag, FP64, load, 8>;
+ def AU : BinaryRX<"au", 0x7E, null_frag, FP32, z_load, 4>;
+ def AW : BinaryRX<"aw", 0x6E, null_frag, FP64, z_load, 8>;
}
// Subtraction.
@@ -154,8 +154,8 @@ let Defs = [CC] in {
def SDR : BinaryRR<"sdr", 0x2B, null_frag, FP64, FP64>;
def SXR : BinaryRR<"sxr", 0x37, null_frag, FP128, FP128>;
- def SE : BinaryRX<"se", 0x7B, null_frag, FP32, load, 4>;
- def SD : BinaryRX<"sd", 0x6B, null_frag, FP64, load, 8>;
+ def SE : BinaryRX<"se", 0x7B, null_frag, FP32, z_load, 4>;
+ def SD : BinaryRX<"sd", 0x6B, null_frag, FP64, z_load, 8>;
}
// Subtraction (unnormalized).
@@ -163,8 +163,8 @@ let Defs = [CC] in {
def SUR : BinaryRR<"sur", 0x3F, null_frag, FP32, FP32>;
def SWR : BinaryRR<"swr", 0x2F, null_frag, FP64, FP64>;
- def SU : BinaryRX<"su", 0x7F, null_frag, FP32, load, 4>;
- def SW : BinaryRX<"sw", 0x6F, null_frag, FP64, load, 8>;
+ def SU : BinaryRX<"su", 0x7F, null_frag, FP32, z_load, 4>;
+ def SW : BinaryRX<"sw", 0x6F, null_frag, FP64, z_load, 8>;
}
// Multiplication.
@@ -173,55 +173,55 @@ let isCommutable = 1 in {
def MDR : BinaryRR <"mdr", 0x2C, null_frag, FP64, FP64>;
def MXR : BinaryRR <"mxr", 0x26, null_frag, FP128, FP128>;
}
-def MEE : BinaryRXE<"mee", 0xED37, null_frag, FP32, load, 4>;
-def MD : BinaryRX <"md", 0x6C, null_frag, FP64, load, 8>;
+def MEE : BinaryRXE<"mee", 0xED37, null_frag, FP32, z_load, 4>;
+def MD : BinaryRX <"md", 0x6C, null_frag, FP64, z_load, 8>;
// Extending multiplication (f32 x f32 -> f64).
def MDER : BinaryRR<"mder", 0x3C, null_frag, FP64, FP32>;
-def MDE : BinaryRX<"mde", 0x7C, null_frag, FP64, load, 4>;
+def MDE : BinaryRX<"mde", 0x7C, null_frag, FP64, z_load, 4>;
let isAsmParserOnly = 1 in {
def MER : BinaryRR<"mer", 0x3C, null_frag, FP64, FP32>;
- def ME : BinaryRX<"me", 0x7C, null_frag, FP64, load, 4>;
+ def ME : BinaryRX<"me", 0x7C, null_frag, FP64, z_load, 4>;
}
// Extending multiplication (f64 x f64 -> f128).
def MXDR : BinaryRR<"mxdr", 0x27, null_frag, FP128, FP64>;
-def MXD : BinaryRX<"mxd", 0x67, null_frag, FP128, load, 8>;
+def MXD : BinaryRX<"mxd", 0x67, null_frag, FP128, z_load, 8>;
// Fused multiply-add.
def MAER : TernaryRRD<"maer", 0xB32E, null_frag, FP32, FP32>;
def MADR : TernaryRRD<"madr", 0xB33E, null_frag, FP64, FP64>;
-def MAE : TernaryRXF<"mae", 0xED2E, null_frag, FP32, FP32, load, 4>;
-def MAD : TernaryRXF<"mad", 0xED3E, null_frag, FP64, FP64, load, 8>;
+def MAE : TernaryRXF<"mae", 0xED2E, null_frag, FP32, FP32, z_load, 4>;
+def MAD : TernaryRXF<"mad", 0xED3E, null_frag, FP64, FP64, z_load, 8>;
// Fused multiply-subtract.
def MSER : TernaryRRD<"mser", 0xB32F, null_frag, FP32, FP32>;
def MSDR : TernaryRRD<"msdr", 0xB33F, null_frag, FP64, FP64>;
-def MSE : TernaryRXF<"mse", 0xED2F, null_frag, FP32, FP32, load, 4>;
-def MSD : TernaryRXF<"msd", 0xED3F, null_frag, FP64, FP64, load, 8>;
+def MSE : TernaryRXF<"mse", 0xED2F, null_frag, FP32, FP32, z_load, 4>;
+def MSD : TernaryRXF<"msd", 0xED3F, null_frag, FP64, FP64, z_load, 8>;
// Multiplication (unnormalized).
def MYR : BinaryRRD<"myr", 0xB33B, null_frag, FP128, FP64>;
def MYHR : BinaryRRD<"myhr", 0xB33D, null_frag, FP64, FP64>;
def MYLR : BinaryRRD<"mylr", 0xB339, null_frag, FP64, FP64>;
-def MY : BinaryRXF<"my", 0xED3B, null_frag, FP128, FP64, load, 8>;
-def MYH : BinaryRXF<"myh", 0xED3D, null_frag, FP64, FP64, load, 8>;
-def MYL : BinaryRXF<"myl", 0xED39, null_frag, FP64, FP64, load, 8>;
+def MY : BinaryRXF<"my", 0xED3B, null_frag, FP128, FP64, z_load, 8>;
+def MYH : BinaryRXF<"myh", 0xED3D, null_frag, FP64, FP64, z_load, 8>;
+def MYL : BinaryRXF<"myl", 0xED39, null_frag, FP64, FP64, z_load, 8>;
// Fused multiply-add (unnormalized).
def MAYR : TernaryRRD<"mayr", 0xB33A, null_frag, FP128, FP64>;
def MAYHR : TernaryRRD<"mayhr", 0xB33C, null_frag, FP64, FP64>;
def MAYLR : TernaryRRD<"maylr", 0xB338, null_frag, FP64, FP64>;
-def MAY : TernaryRXF<"may", 0xED3A, null_frag, FP128, FP64, load, 8>;
-def MAYH : TernaryRXF<"mayh", 0xED3C, null_frag, FP64, FP64, load, 8>;
-def MAYL : TernaryRXF<"mayl", 0xED38, null_frag, FP64, FP64, load, 8>;
+def MAY : TernaryRXF<"may", 0xED3A, null_frag, FP128, FP64, z_load, 8>;
+def MAYH : TernaryRXF<"mayh", 0xED3C, null_frag, FP64, FP64, z_load, 8>;
+def MAYL : TernaryRXF<"mayl", 0xED38, null_frag, FP64, FP64, z_load, 8>;
// Division.
def DER : BinaryRR <"der", 0x3D, null_frag, FP32, FP32>;
def DDR : BinaryRR <"ddr", 0x2D, null_frag, FP64, FP64>;
def DXR : BinaryRRE<"dxr", 0xB22D, null_frag, FP128, FP128>;
-def DE : BinaryRX <"de", 0x7D, null_frag, FP32, load, 4>;
-def DD : BinaryRX <"dd", 0x6D, null_frag, FP64, load, 8>;
+def DE : BinaryRX <"de", 0x7D, null_frag, FP32, z_load, 4>;
+def DD : BinaryRX <"dd", 0x6D, null_frag, FP64, z_load, 8>;
//===----------------------------------------------------------------------===//
@@ -233,7 +233,7 @@ let Defs = [CC] in {
def CDR : CompareRR <"cdr", 0x29, null_frag, FP64, FP64>;
def CXR : CompareRRE<"cxr", 0xB369, null_frag, FP128, FP128>;
- def CE : CompareRX<"ce", 0x79, null_frag, FP32, load, 4>;
- def CD : CompareRX<"cd", 0x69, null_frag, FP64, load, 8>;
+ def CE : CompareRX<"ce", 0x79, null_frag, FP32, z_load, 4>;
+ def CD : CompareRX<"cd", 0x69, null_frag, FP64, z_load, 8>;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index 937e36057a6ede..96ea65b6c3d881 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -306,7 +306,7 @@ let Predicates = [IsTargetXPLINK64] in {
let mayLoad = 1, AddedComplexity = 20, hasNoSchedulingInfo = 1, Defs = [CC] in {
def ADA_ENTRY_VALUE : Alias<12, (outs GR64:$Reg), (ins adasym:$addr,
ADDR64:$ADA, imm64:$Offset),
- [(set i64:$Reg, (load (z_ada_entry
+ [(set i64:$Reg, (z_load (z_ada_entry
iPTR:$addr, iPTR:$ADA, i64:$Offset)))]>;
}
}
@@ -468,12 +468,12 @@ let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in {
// Register loads.
let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in {
// Expands to L, LY or LFH, depending on the choice of register.
- def LMux : UnaryRXYPseudo<"l", load, GRX32, 4>,
+ def LMux : UnaryRXYPseudo<"l", z_load, GRX32, 4>,
Requires<[FeatureHighWord]>;
- defm L : UnaryRXPair<"l", 0x58, 0xE358, load, GR32, 4>;
- def LFH : UnaryRXY<"lfh", 0xE3CA, load, GRH32, 4>,
+ defm L : UnaryRXPair<"l", 0x58, 0xE358, z_load, GR32, 4>;
+ def LFH : UnaryRXY<"lfh", 0xE3CA, z_load, GRH32, 4>,
Requires<[FeatureHighWord]>;
- def LG : UnaryRXY<"lg", 0xE304, load, GR64, 8>;
+ def LG : UnaryRXY<"lg", 0xE304, z_load, GR64, 8>;
// These instructions are split after register allocation, so we don't
// want a custom inserter.
@@ -483,22 +483,22 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1, mayLoad = 1 in {
}
}
let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
- def LT : UnaryRXY<"lt", 0xE312, load, GR32, 4>;
- def LTG : UnaryRXY<"ltg", 0xE302, load, GR64, 8>;
+ def LT : UnaryRXY<"lt", 0xE312, z_load, GR32, 4>;
+ def LTG : UnaryRXY<"ltg", 0xE302, z_load, GR64, 8>;
}
let canFoldAsLoad = 1 in {
- def LRL : UnaryRILPC<"lrl", 0xC4D, aligned_load, GR32>;
- def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_load, GR64>;
+ def LRL : UnaryRILPC<"lrl", 0xC4D, aligned_z_load, GR32>;
+ def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_z_load, GR64>;
}
// Load and zero rightmost byte.
let Predicates = [FeatureLoadAndZeroRightmostByte] in {
def LZRF : UnaryRXY<"lzrf", 0xE33B, null_frag, GR32, 4>;
def LZRG : UnaryRXY<"lzrg", 0xE32A, null_frag, GR64, 8>;
- def : Pat<(and (i32 (load bdxaddr20only:$src)), 0xffffff00),
+ def : Pat<(and (i32 (z_load bdxaddr20only:$src)), 0xffffff00),
(LZRF bdxaddr20only:$src)>;
- def : Pat<(and (i64 (load bdxaddr20only:$src)), 0xffffffffffffff00),
+ def : Pat<(and (i64 (z_load bdxaddr20only:$src)), 0xffffffffffffff00),
(LZRG bdxaddr20only:$src)>;
}
@@ -689,29 +689,29 @@ def : Pat<(sext_inreg GR64:$src, i32),
// 32-bit extensions from 8-bit memory. LBMux expands to LB or LBH,
// depending on the choice of register.
-def LBMux : UnaryRXYPseudo<"lb", asextloadi8, GRX32, 1>,
+def LBMux : UnaryRXYPseudo<"lb", z_asextloadi8, GRX32, 1>,
Requires<[FeatureHighWord]>;
-def LB : UnaryRXY<"lb", 0xE376, asextloadi8, GR32, 1>;
-def LBH : UnaryRXY<"lbh", 0xE3C0, asextloadi8, GRH32, 1>,
+def LB : UnaryRXY<"lb", 0xE376, z_asextloadi8, GR32, 1>;
+def LBH : UnaryRXY<"lbh", 0xE3C0, z_asextloadi8, GRH32, 1>,
Requires<[FeatureHighWord]>;
// 32-bit extensions from 16-bit memory. LHMux expands to LH or LHH,
// depending on the choice of register.
-def LHMux : UnaryRXYPseudo<"lh", asextloadi16, GRX32, 2>,
+def LHMux : UnaryRXYPseudo<"lh", z_asextloadi16, GRX32, 2>,
Requires<[FeatureHighWord]>;
-defm LH : UnaryRXPair<"lh", 0x48, 0xE378, asextloadi16, GR32, 2>;
-def LHH : UnaryRXY<"lhh", 0xE3C4, asextloadi16, GRH32, 2>,
+defm LH : UnaryRXPair<"lh", 0x48, 0xE378, z_asextloadi16, GR32, 2>;
+def LHH : UnaryRXY<"lhh", 0xE3C4, z_asextloadi16, GRH32, 2>,
Requires<[FeatureHighWord]>;
-def LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_asextloadi16, GR32>;
+def LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_z_asextloadi16, GR32>;
// 64-bit extensions from memory.
-def LGB : UnaryRXY<"lgb", 0xE377, asextloadi8, GR64, 1>;
-def LGH : UnaryRXY<"lgh", 0xE315, asextloadi16, GR64, 2>;
-def LGF : UnaryRXY<"lgf", 0xE314, asextloadi32, GR64, 4>;
-def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_asextloadi16, GR64>;
-def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_asextloadi32, GR64>;
+def LGB : UnaryRXY<"lgb", 0xE377, z_asextloadi8, GR64, 1>;
+def LGH : UnaryRXY<"lgh", 0xE315, z_asextloadi16, GR64, 2>;
+def LGF : UnaryRXY<"lgf", 0xE314, z_asextloadi32, GR64, 4>;
+def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_z_asextloadi16, GR64>;
+def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_z_asextloadi32, GR64>;
let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
- def LTGF : UnaryRXY<"ltgf", 0xE332, asextloadi32, GR64, 4>;
+ def LTGF : UnaryRXY<"ltgf", 0xE332, z_asextloadi32, GR64, 4>;
//===----------------------------------------------------------------------===//
// Zero extensions
@@ -740,40 +740,40 @@ def : Pat<(and GR64:$src, 0xffffffff),
// 32-bit extensions from 8-bit memory. LLCMux expands to LLC or LLCH,
// depending on the choice of register.
-def LLCMux : UnaryRXYPseudo<"llc", azextloadi8, GRX32, 1>,
+def LLCMux : UnaryRXYPseudo<"llc", z_azextloadi8, GRX32, 1>,
Requires<[FeatureHighWord]>;
-def LLC : UnaryRXY<"llc", 0xE394, azextloadi8, GR32, 1>;
-def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GRH32, 1>,
+def LLC : UnaryRXY<"llc", 0xE394, z_azextloadi8, GR32, 1>;
+def LLCH : UnaryRXY<"llch", 0xE3C2, z_azextloadi8, GRH32, 1>,
Requires<[FeatureHighWord]>;
// 32-bit extensions from 16-bit memory. LLHMux expands to LLH or LLHH,
// depending on the choice of register.
-def LLHMux : UnaryRXYPseudo<"llh", azextloadi16, GRX32, 2>,
+def LLHMux : UnaryRXYPseudo<"llh", z_azextloadi16, GRX32, 2>,
Requires<[FeatureHighWord]>;
-def LLH : UnaryRXY<"llh", 0xE395, azextloadi16, GR32, 2>;
-def LLHH : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GRH32, 2>,
+def LLH : UnaryRXY<"llh", 0xE395, z_azextloadi16, GR32, 2>;
+def LLHH : UnaryRXY<"llhh", 0xE3C6, z_azextloadi16, GRH32, 2>,
Requires<[FeatureHighWord]>;
-def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_azextloadi16, GR32>;
+def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_z_azextloadi16, GR32>;
// 64-bit extensions from memory.
-def LLGC : UnaryRXY<"llgc", 0xE390, azextloadi8, GR64, 1>;
-def LLGH : UnaryRXY<"llgh", 0xE391, azextloadi16, GR64, 2>;
-def LLGF : UnaryRXY<"llgf", 0xE316, azextloadi32, GR64, 4>;
-def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_azextloadi16, GR64>;
-def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_azextloadi32, GR64>;
+def LLGC : UnaryRXY<"llgc", 0xE390, z_azextloadi8, GR64, 1>;
+def LLGH : UnaryRXY<"llgh", 0xE391, z_azextloadi16, GR64, 2>;
+def LLGF : UnaryRXY<"llgf", 0xE316, z_azextloadi32, GR64, 4>;
+def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_z_azextloadi16, GR64>;
+def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_z_azextloadi32, GR64>;
// 31-to-64-bit zero extensions.
def LLGTR : UnaryRRE<"llgtr", 0xB917, null_frag, GR64, GR64>;
def LLGT : UnaryRXY<"llgt", 0xE317, null_frag, GR64, 4>;
def : Pat<(and GR64:$src, 0x7fffffff),
(LLGTR GR64:$src)>;
-def : Pat<(and (i64 (azextloadi32 bdxaddr20only:$src)), 0x7fffffff),
+def : Pat<(and (i64 (z_azextloadi32 bdxaddr20only:$src)), 0x7fffffff),
(LLGT bdxaddr20only:$src)>;
// Load and zero rightmost byte.
let Predicates = [FeatureLoadAndZeroRightmostByte] in {
def LLZRGF : UnaryRXY<"llzrgf", 0xE33A, null_frag, GR64, 4>;
- def : Pat<(and (i64 (azextloadi32 bdxaddr20only:$src)), 0xffffff00),
+ def : Pat<(and (i64 (z_azextloadi32 bdxaddr20only:$src)), 0xffffff00),
(LLZRGF bdxaddr20only:$src)>;
}
@@ -930,14 +930,14 @@ defm : SXU<ineg, LCGFR>;
//===----------------------------------------------------------------------===//
let isCodeGenOnly = 1 in
- defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, azextloadi8, 1>;
-defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, azextloadi8, 1>;
+ defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, z_azextloadi8, 1>;
+defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, z_azextloadi8, 1>;
-defm : InsertMem<"inserti8", IC32, GR32, azextloadi8, bdxaddr12pair>;
-defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>;
+defm : InsertMem<"inserti8", IC32, GR32, z_azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", IC32Y, GR32, z_azextloadi8, bdxaddr20pair>;
-defm : InsertMem<"inserti8", IC, GR64, azextloadi8, bdxaddr12pair>;
-defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>;
+defm : InsertMem<"inserti8", IC, GR64, z_azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", ICY, GR64, z_azextloadi8, bdxaddr20pair>;
// Insert characters under mask -- not (yet) used for codegen.
let Defs = [CC] in {
@@ -1015,12 +1015,12 @@ let Defs = [CC], CCValues = 0xF, CCIfNoSignedWrap = 1 in {
def AGFI : BinaryRIL<"agfi", 0xC28, z_sadd, GR64, imm64sx32>;
// Addition of memory.
- defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, z_sadd, GR32, asextloadi16, 2>;
- defm A : BinaryRXPairAndPseudo<"a", 0x5A, 0xE35A, z_sadd, GR32, load, 4>;
- def AGH : BinaryRXY<"agh", 0xE338, z_sadd, GR64, asextloadi16, 2>,
+ defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, z_sadd, GR32, z_asextloadi16, 2>;
+ defm A : BinaryRXPairAndPseudo<"a", 0x5A, 0xE35A, z_sadd, GR32, z_load, 4>;
+ def AGH : BinaryRXY<"agh", 0xE338, z_sadd, GR64, z_asextloadi16, 2>,
Requires<[FeatureMiscellaneousExtensions2]>;
- def AGF : BinaryRXY<"agf", 0xE318, z_sadd, GR64, asextloadi32, 4>;
- defm AG : BinaryRXYAndPseudo<"ag", 0xE308, z_sadd, GR64, load, 8>;
+ def AGF : BinaryRXY<"agf", 0xE318, z_sadd, GR64, z_asextloadi32, 4>;
+ defm AG : BinaryRXYAndPseudo<"ag", 0xE308, z_sadd, GR64, z_load, 8>;
// Addition to memory.
def ASI : BinarySIY<"asi", 0xEB6A, add, imm32sx8>;
@@ -1058,9 +1058,9 @@ let Defs = [CC], CCValues = 0xF, IsLogical = 1 in {
Requires<[FeatureHighWord]>;
// Addition of memory.
- defm AL : BinaryRXPairAndPseudo<"al", 0x5E, 0xE35E, z_uadd, GR32, load, 4>;
- def ALGF : BinaryRXY<"algf", 0xE31A, z_uadd, GR64, azextloadi32, 4>;
- defm ALG : BinaryRXYAndPseudo<"alg", 0xE30A, z_uadd, GR64, load, 8>;
+ defm AL : BinaryRXPairAndPseudo<"al", 0x5E, 0xE35E, z_uadd, GR32, z_load, 4>;
+ def ALGF : BinaryRXY<"algf", 0xE31A, z_uadd, GR64, z_azextloadi32, 4>;
+ defm ALG : BinaryRXYAndPseudo<"alg", 0xE30A, z_uadd, GR64, z_load, 8>;
// Addition to memory.
def ALSI : BinarySIY<"alsi", 0xEB6E, null_frag, imm32sx8>;
@@ -1075,8 +1075,8 @@ let Defs = [CC], Uses = [CC], CCValues = 0xF, IsLogical = 1 in {
def ALCGR : BinaryRRE<"alcgr", 0xB988, z_addcarry, GR64, GR64>;
// Addition of memory.
- def ALC : BinaryRXY<"alc", 0xE398, z_addcarry, GR32, load, 4>;
- def ALCG : BinaryRXY<"alcg", 0xE388, z_addcarry, GR64, load, 8>;
+ def ALC : BinaryRXY<"alc", 0xE398, z_addcarry, GR32, z_load, 4>;
+ def ALCG : BinaryRXY<"alcg", 0xE388, z_addcarry, GR64, z_load, 8>;
}
// Addition that does not modify the condition code.
@@ -1103,12 +1103,12 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8,
Requires<[FeatureHighWord]>;
// Subtraction of memory.
- defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, z_ssub, GR32, asextloadi16, 2>;
- defm S : BinaryRXPairAndPseudo<"s", 0x5B, 0xE35B, z_ssub, GR32, load, 4>;
- def SGH : BinaryRXY<"sgh", 0xE339, z_ssub, GR64, asextloadi16, 2>,
+ defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, z_ssub, GR32, z_asextloadi16, 2>;
+ defm S : BinaryRXPairAndPseudo<"s", 0x5B, 0xE35B, z_ssub, GR32, z_load, 4>;
+ def SGH : BinaryRXY<"sgh", 0xE339, z_ssub, GR64, z_asextloadi16, 2>,
Requires<[FeatureMiscellaneousExtensions2]>;
- def SGF : BinaryRXY<"sgf", 0xE319, z_ssub, GR64, asextloadi32, 4>;
- defm SG : BinaryRXYAndPseudo<"sg", 0xE309, z_ssub, GR64, load, 8>;
+ def SGF : BinaryRXY<"sgf", 0xE319, z_ssub, GR64, z_asextloadi32, 4>;
+ defm SG : BinaryRXYAndPseudo<"sg", 0xE309, z_ssub, GR64, z_load, 8>;
}
defm : SXB<z_ssub, GR64, SGFR>;
@@ -1156,9 +1156,9 @@ let Defs = [CC], CCValues = 0x7, IsLogical = 1 in {
def SLGFI : BinaryRIL<"slgfi", 0xC24, z_usub, GR64, imm64zx32>;
// Subtraction of memory.
- defm SL : BinaryRXPairAndPseudo<"sl", 0x5F, 0xE35F, z_usub, GR32, load, 4>;
- def SLGF : BinaryRXY<"slgf", 0xE31B, z_usub, GR64, azextloadi32, 4>;
- defm SLG : BinaryRXYAndPseudo<"slg", 0xE30B, z_usub, GR64, load, 8>;
+ defm SL : BinaryRXPairAndPseudo<"sl", 0x5F, 0xE35F, z_usub, GR32, z_load, 4>;
+ def SLGF : BinaryRXY<"slgf", 0xE31B, z_usub, GR64, z_azextloadi32, 4>;
+ defm SLG : BinaryRXYAndPseudo<"slg", 0xE30B, z_usub, GR64, z_load, 8>;
}
defm : ZXB<z_usub, GR64, SLGFR>;
@@ -1183,8 +1183,8 @@ let Defs = [CC], Uses = [CC], CCValues = 0xF, IsLogical = 1 in {
def SLBGR : BinaryRRE<"slbgr", 0xB989, z_subcarry, GR64, GR64>;
// Subtraction of memory.
- def SLB : BinaryRXY<"slb", 0xE399, z_subcarry, GR32, load, 4>;
- def SLBG : BinaryRXY<"slbg", 0xE389, z_subcarry, GR64, load, 8>;
+ def SLB : BinaryRXY<"slb", 0xE399, z_subcarry, GR32, z_load, 4>;
+ def SLBG : BinaryRXY<"slbg", 0xE389, z_subcarry, GR64, z_load, 8>;
}
@@ -1233,8 +1233,8 @@ let Defs = [CC] in {
// ANDs of memory.
let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
- defm N : BinaryRXPairAndPseudo<"n", 0x54, 0xE354, and, GR32, load, 4>;
- defm NG : BinaryRXYAndPseudo<"ng", 0xE380, and, GR64, load, 8>;
+ defm N : BinaryRXPairAndPseudo<"n", 0x54, 0xE354, and, GR32, z_load, 4>;
+ defm NG : BinaryRXYAndPseudo<"ng", 0xE380, and, GR64, z_load, 8>;
}
// AND to memory
@@ -1290,8 +1290,8 @@ let Defs = [CC] in {
// ORs of memory.
let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
- defm O : BinaryRXPairAndPseudo<"o", 0x56, 0xE356, or, GR32, load, 4>;
- defm OG : BinaryRXYAndPseudo<"og", 0xE381, or, GR64, load, 8>;
+ defm O : BinaryRXPairAndPseudo<"o", 0x56, 0xE356, or, GR32, z_load, 4>;
+ defm OG : BinaryRXYAndPseudo<"og", 0xE381, or, GR64, z_load, 8>;
}
// OR to memory
@@ -1330,8 +1330,8 @@ let Defs = [CC] in {
// XORs of memory.
let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
- defm X : BinaryRXPairAndPseudo<"x",0x57, 0xE357, xor, GR32, load, 4>;
- defm XG : BinaryRXYAndPseudo<"xg", 0xE382, xor, GR64, load, 8>;
+ defm X : BinaryRXPairAndPseudo<"x",0x57, 0xE357, xor, GR32, z_load, 4>;
+ defm XG : BinaryRXYAndPseudo<"xg", 0xE382, xor, GR64, z_load, 8>;
}
// XOR to memory
@@ -1411,17 +1411,17 @@ def MSFI : BinaryRIL<"msfi", 0xC21, mul, GR32, simm32>;
def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>;
// Multiplication of memory.
-defm MH : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>;
-defm MS : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>;
-def MGH : BinaryRXY<"mgh", 0xE33C, mul, GR64, asextloadi16, 2>,
+defm MH : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, z_asextloadi16, 2>;
+defm MS : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, z_load, 4>;
+def MGH : BinaryRXY<"mgh", 0xE33C, mul, GR64, z_asextloadi16, 2>,
Requires<[FeatureMiscellaneousExtensions2]>;
-def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>;
-def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>;
+def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, z_asextloadi32, 4>;
+def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, z_load, 8>;
// Multiplication of memory, setting the condition code.
let Predicates = [FeatureMiscellaneousExtensions2], Defs = [CC] in {
- defm MSC : BinaryRXYAndPseudo<"msc", 0xE353, null_frag, GR32, load, 4>;
- defm MSGC : BinaryRXYAndPseudo<"msgc", 0xE383, null_frag, GR64, load, 8>;
+ defm MSC : BinaryRXYAndPseudo<"msc", 0xE353, null_frag, GR32, z_load, 4>;
+ defm MSGC : BinaryRXYAndPseudo<"msgc", 0xE383, null_frag, GR64, z_load, 8>;
}
// Multiplication of a register, producing two results.
@@ -1437,16 +1437,16 @@ def : Pat<(z_umul_lohi GR64:$src1, GR64:$src2),
(MLGR (AEXT128 GR64:$src1), GR64:$src2)>;
// Multiplication of memory, producing two results.
-def M : BinaryRX <"m", 0x5C, null_frag, GR128, load, 4>;
-def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>;
-def MG : BinaryRXY<"mg", 0xE384, null_frag, GR128, load, 8>,
+def M : BinaryRX <"m", 0x5C, null_frag, GR128, z_load, 4>;
+def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, z_load, 4>;
+def MG : BinaryRXY<"mg", 0xE384, null_frag, GR128, z_load, 8>,
Requires<[FeatureMiscellaneousExtensions2]>;
-def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>;
-def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, load, 8>;
+def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, z_load, 4>;
+def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, z_load, 8>;
-def : Pat<(z_smul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+def : Pat<(z_smul_lohi GR64:$src1, (i64 (z_load bdxaddr20only:$src2))),
(MG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
-def : Pat<(z_umul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+def : Pat<(z_umul_lohi GR64:$src1, (i64 (z_load bdxaddr20only:$src2))),
(MLG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
//===----------------------------------------------------------------------===//
@@ -1462,30 +1462,30 @@ let hasSideEffects = 1 in { // Do not speculatively execute.
def DLGR : BinaryRRE<"dlgr", 0xB987, null_frag, GR128, GR64>;
// Division and remainder, from memory.
- def D : BinaryRX <"d", 0x5D, null_frag, GR128, load, 4>;
- def DSGF : BinaryRXY<"dsgf", 0xE31D, null_frag, GR128, load, 4>;
- def DSG : BinaryRXY<"dsg", 0xE30D, null_frag, GR128, load, 8>;
- def DL : BinaryRXY<"dl", 0xE397, null_frag, GR128, load, 4>;
- def DLG : BinaryRXY<"dlg", 0xE387, null_frag, GR128, load, 8>;
+ def D : BinaryRX <"d", 0x5D, null_frag, GR128, z_load, 4>;
+ def DSGF : BinaryRXY<"dsgf", 0xE31D, null_frag, GR128, z_load, 4>;
+ def DSG : BinaryRXY<"dsg", 0xE30D, null_frag, GR128, z_load, 8>;
+ def DL : BinaryRXY<"dl", 0xE397, null_frag, GR128, z_load, 4>;
+ def DLG : BinaryRXY<"dlg", 0xE387, null_frag, GR128, z_load, 8>;
}
def : Pat<(z_sdivrem GR64:$src1, GR32:$src2),
(DSGFR (AEXT128 GR64:$src1), GR32:$src2)>;
-def : Pat<(z_sdivrem GR64:$src1, (i32 (load bdxaddr20only:$src2))),
+def : Pat<(z_sdivrem GR64:$src1, (i32 (z_load bdxaddr20only:$src2))),
(DSGF (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
def : Pat<(z_sdivrem GR64:$src1, GR64:$src2),
(DSGR (AEXT128 GR64:$src1), GR64:$src2)>;
-def : Pat<(z_sdivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+def : Pat<(z_sdivrem GR64:$src1, (i64 (z_load bdxaddr20only:$src2))),
(DSG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
def : Pat<(z_udivrem GR32:$src1, GR32:$src2),
(DLR (ZEXT128 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src1,
subreg_l32)), GR32:$src2)>;
-def : Pat<(z_udivrem GR32:$src1, (i32 (load bdxaddr20only:$src2))),
+def : Pat<(z_udivrem GR32:$src1, (i32 (z_load bdxaddr20only:$src2))),
(DL (ZEXT128 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src1,
subreg_l32)), bdxaddr20only:$src2)>;
def : Pat<(z_udivrem GR64:$src1, GR64:$src2),
(DLGR (ZEXT128 GR64:$src1), GR64:$src2)>;
-def : Pat<(z_udivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+def : Pat<(z_udivrem GR64:$src1, (i64 (z_load bdxaddr20only:$src2))),
(DLG (ZEXT128 GR64:$src1), bdxaddr20only:$src2)>;
//===----------------------------------------------------------------------===//
@@ -1591,25 +1591,25 @@ let Defs = [CC], CCValues = 0xE in {
def CGFI : CompareRIL<"cgfi", 0xC2C, z_scmp, GR64, imm64sx32>;
// Comparison with memory.
- defm CH : CompareRXPair<"ch", 0x49, 0xE379, z_scmp, GR32, asextloadi16, 2>;
- def CMux : CompareRXYPseudo<z_scmp, GRX32, load, 4>,
+ defm CH : CompareRXPair<"ch", 0x49, 0xE379, z_scmp, GR32, z_asextloadi16, 2>;
+ def CMux : CompareRXYPseudo<z_scmp, GRX32, z_load, 4>,
Requires<[FeatureHighWord]>;
- defm C : CompareRXPair<"c", 0x59, 0xE359, z_scmp, GR32, load, 4>;
- def CHF : CompareRXY<"chf", 0xE3CD, z_scmp, GRH32, load, 4>,
+ defm C : CompareRXPair<"c", 0x59, 0xE359, z_scmp, GR32, z_load, 4>;
+ def CHF : CompareRXY<"chf", 0xE3CD, z_scmp, GRH32, z_load, 4>,
Requires<[FeatureHighWord]>;
- def CGH : CompareRXY<"cgh", 0xE334, z_scmp, GR64, asextloadi16, 2>;
- def CGF : CompareRXY<"cgf", 0xE330, z_scmp, GR64, asextloadi32, 4>;
- def CG : CompareRXY<"cg", 0xE320, z_scmp, GR64, load, 8>;
- def CHRL : CompareRILPC<"chrl", 0xC65, z_scmp, GR32, aligned_asextloadi16>;
- def CRL : CompareRILPC<"crl", 0xC6D, z_scmp, GR32, aligned_load>;
- def CGHRL : CompareRILPC<"cghrl", 0xC64, z_scmp, GR64, aligned_asextloadi16>;
- def CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_scmp, GR64, aligned_asextloadi32>;
- def CGRL : CompareRILPC<"cgrl", 0xC68, z_scmp, GR64, aligned_load>;
+ def CGH : CompareRXY<"cgh", 0xE334, z_scmp, GR64, z_asextloadi16, 2>;
+ def CGF : CompareRXY<"cgf", 0xE330, z_scmp, GR64, z_asextloadi32, 4>;
+ def CG : CompareRXY<"cg", 0xE320, z_scmp, GR64, z_load, 8>;
+ def CHRL : CompareRILPC<"chrl", 0xC65, z_scmp, GR32, aligned_z_asextloadi16>;
+ def CRL : CompareRILPC<"crl", 0xC6D, z_scmp, GR32, aligned_z_load>;
+ def CGHRL : CompareRILPC<"cghrl", 0xC64, z_scmp, GR64, aligned_z_asextloadi16>;
+ def CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_scmp, GR64, aligned_z_asextloadi32>;
+ def CGRL : CompareRILPC<"cgrl", 0xC68, z_scmp, GR64, aligned_z_load>;
// Comparison between memory and a signed 16-bit immediate.
- def CHHSI : CompareSIL<"chhsi", 0xE554, z_scmp, asextloadi16, imm32sx16>;
- def CHSI : CompareSIL<"chsi", 0xE55C, z_scmp, load, imm32sx16>;
- def CGHSI : CompareSIL<"cghsi", 0xE558, z_scmp, load, imm64sx16>;
+ def CHHSI : CompareSIL<"chhsi", 0xE554, z_scmp, z_asextloadi16, imm32sx16>;
+ def CHSI : CompareSIL<"chsi", 0xE55C, z_scmp, z_load, imm32sx16>;
+ def CGHSI : CompareSIL<"cghsi", 0xE558, z_scmp, z_load, imm64sx16>;
}
defm : SXB<z_scmp, GR64, CGFR>;
@@ -1636,31 +1636,31 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
def CLGFI : CompareRIL<"clgfi", 0xC2E, z_ucmp, GR64, imm64zx32>;
// Comparison with memory.
- def CLMux : CompareRXYPseudo<z_ucmp, GRX32, load, 4>,
+ def CLMux : CompareRXYPseudo<z_ucmp, GRX32, z_load, 4>,
Requires<[FeatureHighWord]>;
- defm CL : CompareRXPair<"cl", 0x55, 0xE355, z_ucmp, GR32, load, 4>;
- def CLHF : CompareRXY<"clhf", 0xE3CF, z_ucmp, GRH32, load, 4>,
+ defm CL : CompareRXPair<"cl", 0x55, 0xE355, z_ucmp, GR32, z_load, 4>;
+ def CLHF : CompareRXY<"clhf", 0xE3CF, z_ucmp, GRH32, z_load, 4>,
Requires<[FeatureHighWord]>;
- def CLGF : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, azextloadi32, 4>;
- def CLG : CompareRXY<"clg", 0xE321, z_ucmp, GR64, load, 8>;
+ def CLGF : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, z_azextloadi32, 4>;
+ def CLG : CompareRXY<"clg", 0xE321, z_ucmp, GR64, z_load, 8>;
def CLHRL : CompareRILPC<"clhrl", 0xC67, z_ucmp, GR32,
- aligned_azextloadi16>;
+ aligned_z_azextloadi16>;
def CLRL : CompareRILPC<"clrl", 0xC6F, z_ucmp, GR32,
- aligned_load>;
+ aligned_z_load>;
def CLGHRL : CompareRILPC<"clghrl", 0xC66, z_ucmp, GR64,
- aligned_azextloadi16>;
+ aligned_z_azextloadi16>;
def CLGFRL : CompareRILPC<"clgfrl", 0xC6E, z_ucmp, GR64,
- aligned_azextloadi32>;
+ aligned_z_azextloadi32>;
def CLGRL : CompareRILPC<"clgrl", 0xC6A, z_ucmp, GR64,
- aligned_load>;
+ aligned_z_load>;
// Comparison between memory and an unsigned 8-bit immediate.
- defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, azextloadi8, imm32zx8>;
+ defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, z_azextloadi8, imm32zx8>;
// Comparison between memory and an unsigned 16-bit immediate.
- def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, azextloadi16, imm32zx16>;
- def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, load, imm32zx16>;
- def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, load, imm64zx16>;
+ def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, z_azextloadi16, imm32zx16>;
+ def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, z_load, imm32zx16>;
+ def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, z_load, imm64zx16>;
}
defm : ZXB<z_ucmp, GR64, CLGFR>;
@@ -1693,7 +1693,7 @@ let Defs = [CC] in {
def TMHL64 : CompareAliasRI<z_tm_reg, GR64, imm64hl16>;
def TMHH64 : CompareAliasRI<z_tm_reg, GR64, imm64hh16>;
- defm TM : CompareSIPair<"tm", 0x91, 0xEB51, z_tm_mem, anyextloadi8, imm32zx8>;
+ defm TM : CompareSIPair<"tm", 0x91, 0xEB51, z_tm_mem, z_anyextloadi8, imm32zx8>;
}
def TML : InstAlias<"tml\t$R, $I", (TMLL GR32:$R, imm32ll16:$I), 0>;
@@ -1914,8 +1914,8 @@ let Predicates = [FeatureGuardedStorage], hasSideEffects = 1 in {
// Decimal arithmetic
//===----------------------------------------------------------------------===//
-defm CVB : BinaryRXPair<"cvb",0x4F, 0xE306, null_frag, GR32, load, 4>;
-def CVBG : BinaryRXY<"cvbg", 0xE30E, null_frag, GR64, load, 8>;
+defm CVB : BinaryRXPair<"cvb",0x4F, 0xE306, null_frag, GR32, z_load, 4>;
+def CVBG : BinaryRXY<"cvbg", 0xE30E, null_frag, GR64, z_load, 8>;
defm CVD : StoreRXPair<"cvd", 0x4E, 0xE326, null_frag, GR32, 4>;
def CVDG : StoreRXY<"cvdg", 0xE32E, null_frag, GR64, 8>;
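
[Editorial note, not part of the patch] The z_asextload*/z_azextload* fragments used above are what let an extending use of an atomic load be selected as a single extending-load instruction, as in the lb case of test f25_c. A minimal IR sketch (function name hypothetical, expected output an untested guess):

define i64 @sext_atomic_i16(ptr %src) {
  %h = load atomic i16, ptr %src seq_cst, align 2
  %e = sext i16 %h to i64
  ret i64 %e
}
; intended to match z_asextloadi16 and select:  lgh %r2, 0(%r2)
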
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 799b27d74414d5..245e3c3399a986 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -140,8 +140,8 @@ let Predicates = [FeatureVector] in {
// to use those instructions rather than force a 20-bit displacement
// into a GPR temporary.
let mayLoad = 1 in {
- def VL32 : UnaryAliasVRX<load, v32sb, bdxaddr12pair>;
- def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
+ def VL32 : UnaryAliasVRX<z_load, v32sb, bdxaddr12pair>;
+ def VL64 : UnaryAliasVRX<z_load, v64db, bdxaddr12pair>;
}
// Load logical element and zero.
@@ -198,12 +198,12 @@ multiclass ReplicatePeephole<Instruction vlrep, ValueType vectype,
(scalartype (load bdxaddr12only:$addr)))),
(vlrep bdxaddr12only:$addr)>;
}
-defm : ReplicatePeephole<VLREPB, v16i8, anyextloadi8, i32>;
-defm : ReplicatePeephole<VLREPH, v8i16, anyextloadi16, i32>;
-defm : ReplicatePeephole<VLREPF, v4i32, load, i32>;
-defm : ReplicatePeephole<VLREPG, v2i64, load, i64>;
-defm : ReplicatePeephole<VLREPF, v4f32, load, f32>;
-defm : ReplicatePeephole<VLREPG, v2f64, load, f64>;
+defm : ReplicatePeephole<VLREPB, v16i8, z_anyextloadi8, i32>;
+defm : ReplicatePeephole<VLREPH, v8i16, z_anyextloadi16, i32>;
+defm : ReplicatePeephole<VLREPF, v4i32, z_load, i32>;
+defm : ReplicatePeephole<VLREPG, v2i64, z_load, i64>;
+defm : ReplicatePeephole<VLREPF, v4f32, z_load, f32>;
+defm : ReplicatePeephole<VLREPG, v2f64, z_load, f64>;
//===----------------------------------------------------------------------===//
// Stores
@@ -1561,13 +1561,13 @@ let Predicates = [FeatureVector] in {
// Any-extending loads into i128.
let Predicates = [FeatureVector] in {
- def : Pat<(i128 (extloadi8 bdxaddr12only:$addr)),
+ def : Pat<(i128 (z_extloadi8 bdxaddr12only:$addr)),
(VLREPB bdxaddr12only:$addr)>;
- def : Pat<(i128 (extloadi16 bdxaddr12only:$addr)),
+ def : Pat<(i128 (z_extloadi16 bdxaddr12only:$addr)),
(VLREPH bdxaddr12only:$addr)>;
- def : Pat<(i128 (extloadi32 bdxaddr12only:$addr)),
+ def : Pat<(i128 (z_extloadi32 bdxaddr12only:$addr)),
(VLREPF bdxaddr12only:$addr)>;
- def : Pat<(i128 (extloadi64 bdxaddr12only:$addr)),
+ def : Pat<(i128 (z_extloadi64 bdxaddr12only:$addr)),
(VLREPG bdxaddr12only:$addr)>;
}
@@ -1621,13 +1621,13 @@ let Predicates = [FeatureVector] in {
// Zero-extending loads into i128.
let Predicates = [FeatureVector] in {
- def : Pat<(i128 (zextloadi8 bdxaddr12only:$addr)),
+ def : Pat<(i128 (z_zextloadi8 bdxaddr12only:$addr)),
(VLEB (VGBM 0), bdxaddr12only:$addr, 15)>;
- def : Pat<(i128 (zextloadi16 bdxaddr12only:$addr)),
+ def : Pat<(i128 (z_zextloadi16 bdxaddr12only:$addr)),
(VLEH (VGBM 0), bdxaddr12only:$addr, 7)>;
- def : Pat<(i128 (zextloadi32 bdxaddr12only:$addr)),
+ def : Pat<(i128 (z_zextloadi32 bdxaddr12only:$addr)),
(VLEF (VGBM 0), bdxaddr12only:$addr, 3)>;
- def : Pat<(i128 (zextloadi64 bdxaddr12only:$addr)),
+ def : Pat<(i128 (z_zextloadi64 bdxaddr12only:$addr)),
(VLEG (VGBM 0), bdxaddr12only:$addr, 1)>;
}
@@ -1663,13 +1663,13 @@ let Predicates = [FeatureVector] in {
// Sign-extending loads into i128.
let Predicates = [FeatureVector] in {
- def : Pat<(i128 (sextloadi8 bdxaddr12only:$addr)),
+ def : Pat<(i128 (z_sextloadi8 bdxaddr12only:$addr)),
(VSRAB (VLREPB bdxaddr12only:$addr), (VREPIB 120))>;
- def : Pat<(i128 (sextloadi16 bdxaddr12only:$addr)),
+ def : Pat<(i128 (z_sextloadi16 bdxaddr12only:$addr)),
(VSRAB (VLREPH bdxaddr12only:$addr), (VREPIB 112))>;
- def : Pat<(i128 (sextloadi32 bdxaddr12only:$addr)),
+ def : Pat<(i128 (z_sextloadi32 bdxaddr12only:$addr)),
(VSRAB (VLREPF bdxaddr12only:$addr), (VREPIB 96))>;
- def : Pat<(i128 (sextloadi64 bdxaddr12only:$addr)),
+ def : Pat<(i128 (z_sextloadi64 bdxaddr12only:$addr)),
(VSRAB (VLREPG bdxaddr12only:$addr), (VREPIB 64))>;
}
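
[Editorial note, not part of the patch] The SystemZOperators.td hunks below define the PatFrags used throughout: z_load accepts both regular loads and non-extending atomic loads, and the extending variants key off the atomic load's extension type. Together with the ReplicatePeephole change above, that should cover the positive counterpart of tests f25_c/f25_d, sketched here (function name hypothetical, expected output an untested guess):

define <4 x i32> @splat_atomic_i32(ptr %src) {
  %l = load atomic i32, ptr %src seq_cst, align 4
  %v = insertelement <4 x i32> undef, i32 %l, i32 0
  %s = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %s
}
; intended to select a single:  vlrepf %v24, 0(%r2)
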
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index d98bb886c18506..fa292b1b01cd1e 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -534,37 +534,108 @@ def zext8 : PatFrag<(ops node:$src), (and node:$src, 0xff)>;
def zext16 : PatFrag<(ops node:$src), (and node:$src, 0xffff)>;
def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>;
-// Extending loads in which the extension type can be signed.
-def asextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
- unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
- return Type == ISD::EXTLOAD || Type == ISD::SEXTLOAD;
+// Match a load or a non-extending atomic load.
+def z_load : PatFrags<(ops node:$ptr),
+ [(load node:$ptr),
+ (atomic_load node:$ptr)], [{
+ if (auto *AL = dyn_cast<AtomicSDNode>(N)) // FIXME: use getLoadExtType()?
+ if (AL->getExtensionType() != ISD::NON_EXTLOAD)
+ return false;
+ return true;
}]>;
-def asextloadi8 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+
+// Sign extending (atomic) loads.
+def z_sextload : PatFrags<(ops node:$ptr),
+ [(unindexedload node:$ptr),
+ (atomic_load node:$ptr)], [{
+ return getLoadExtType(N) == ISD::SEXTLOAD;
}]>;
-def asextloadi16 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+def z_sextloadi8 : PatFrag<(ops node:$ptr), (z_sextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
}]>;
-def asextloadi32 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+def z_sextloadi16 : PatFrag<(ops node:$ptr), (z_sextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_sextloadi32 : PatFrag<(ops node:$ptr), (z_sextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_sextloadi64 : PatFrag<(ops node:$ptr), (z_sextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i64;
}]>;
-// Extending loads in which the extension type can be unsigned.
-def azextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
- unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
- return Type == ISD::EXTLOAD || Type == ISD::ZEXTLOAD;
+// Zero extending (atomic) loads.
+def z_zextload : PatFrags<(ops node:$ptr),
+ [(unindexedload node:$ptr),
+ (atomic_load node:$ptr)], [{
+ return getLoadExtType(N) == ISD::ZEXTLOAD;
}]>;
-def azextloadi8 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+def z_zextloadi8 : PatFrag<(ops node:$ptr), (z_zextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
}]>;
-def azextloadi16 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+def z_zextloadi16 : PatFrag<(ops node:$ptr), (z_zextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
}]>;
-def azextloadi32 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
- return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+def z_zextloadi32 : PatFrag<(ops node:$ptr), (z_zextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_zextloadi64 : PatFrag<(ops node:$ptr), (z_zextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+// Extending (atomic) loads in which the extension type can be signed.
+def z_asextload : PatFrags<(ops node:$ptr),
+ [(unindexedload node:$ptr),
+ (atomic_load node:$ptr)], [{
+ ISD::LoadExtType ETy = getLoadExtType(N);
+ return ETy == ISD::EXTLOAD || ETy == ISD::SEXTLOAD;
+}]>;
+def z_asextloadi8 : PatFrag<(ops node:$ptr), (z_asextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def z_asextloadi16 : PatFrag<(ops node:$ptr), (z_asextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_asextloadi32 : PatFrag<(ops node:$ptr), (z_asextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
}]>;
-// Extending loads in which the extension type doesn't matter.
+// Extending (atomic) loads in which the extension type can be unsigned.
+def z_azextload : PatFrags<(ops node:$ptr),
+ [(unindexedload node:$ptr),
+ (atomic_load node:$ptr)], [{
+ ISD::LoadExtType ETy = getLoadExtType(N);
+ return ETy == ISD::EXTLOAD || ETy == ISD::ZEXTLOAD;
+}]>;
+def z_azextloadi8 : PatFrag<(ops node:$ptr), (z_azextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def z_azextloadi16 : PatFrag<(ops node:$ptr), (z_azextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_azextloadi32 : PatFrag<(ops node:$ptr), (z_azextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Extending (atomic) loads in which the extension type doesn't matter.
+def z_anyextload : PatFrags<(ops node:$ptr),
+ [(unindexedload node:$ptr),
+ (atomic_load node:$ptr)], [{
+ return getLoadExtType(N) != ISD::NON_EXTLOAD;
+}]>;
+def z_anyextloadi8 : PatFrag<(ops node:$ptr), (z_anyextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def z_anyextloadi16 : PatFrag<(ops node:$ptr), (z_anyextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_anyextloadi32 : PatFrag<(ops node:$ptr), (z_anyextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_anyextloadi64 : PatFrag<(ops node:$ptr), (z_anyextload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+// Extending non-atomic loads in which the extension type doesn't matter.
def anyextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
return cast<LoadSDNode>(N)->getExtensionType() != ISD::NON_EXTLOAD;
}]>;
@@ -578,15 +649,42 @@ def anyextloadi32 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
}]>;
+// Extending (atomic) loads that are not sign/zero extending.
+def z_extload : PatFrags<(ops node:$ptr),
+ [(extload node:$ptr),
+ (atomic_load node:$ptr)], [{
+ return getLoadExtType(N) == ISD::EXTLOAD;
+}]>;
+def z_extloadi8 : PatFrag<(ops node:$ptr), (z_extload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def z_extloadi16 : PatFrag<(ops node:$ptr), (z_extload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_extloadi32 : PatFrag<(ops node:$ptr), (z_extload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_extloadi64 : PatFrag<(ops node:$ptr), (z_extload node:$ptr), [{
+ return cast<MemSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+// Extending atomic FP loads.
+def z_any_extloadf32 : PatFrags<(ops node:$ptr),
+ [(any_extloadf32 node:$ptr),
+ (any_fpextend (f32 (atomic_load node:$ptr)))]>;
+def z_any_extloadf64 : PatFrags<(ops node:$ptr),
+ [(any_extloadf64 node:$ptr),
+ (any_fpextend (f64 (atomic_load node:$ptr)))]>;
+
// Aligned loads.
class AlignedLoad<SDPatternOperator load>
: PatFrag<(ops node:$addr), (load node:$addr),
[{ return storeLoadIsAligned(N); }]>;
-def aligned_load : AlignedLoad<load>;
-def aligned_asextloadi16 : AlignedLoad<asextloadi16>;
-def aligned_asextloadi32 : AlignedLoad<asextloadi32>;
-def aligned_azextloadi16 : AlignedLoad<azextloadi16>;
-def aligned_azextloadi32 : AlignedLoad<azextloadi32>;
+def aligned_z_load : AlignedLoad<z_load>;
+def aligned_z_asextloadi16 : AlignedLoad<z_asextloadi16>;
+def aligned_z_asextloadi32 : AlignedLoad<z_asextloadi32>;
+def aligned_z_azextloadi16 : AlignedLoad<z_azextloadi16>;
+def aligned_z_azextloadi32 : AlignedLoad<z_azextloadi32>;
// Aligned stores.
class AlignedStore<SDPatternOperator store>
@@ -749,7 +847,7 @@ def z_any_vround : PatFrags<(ops node:$src),
// Create a unary operator that loads from memory and then performs
// the given operation on it.
-class loadu<SDPatternOperator operator, SDPatternOperator load = load>
+class loadu<SDPatternOperator operator, SDPatternOperator load = z_load>
: PatFrag<(ops node:$addr), (operator (load node:$addr))>;
// Create a store operator that performs the given unary operation
@@ -799,12 +897,12 @@ def imm32nobytes : PatLeaf<(i32 imm), [{
class z_replicate_load<ValueType scalartype, SDPatternOperator load>
: PatFrag<(ops node:$addr),
(z_replicate (scalartype (load node:$addr)))>;
-def z_replicate_loadi8 : z_replicate_load<i32, anyextloadi8>;
-def z_replicate_loadi16 : z_replicate_load<i32, anyextloadi16>;
-def z_replicate_loadi32 : z_replicate_load<i32, load>;
-def z_replicate_loadi64 : z_replicate_load<i64, load>;
-def z_replicate_loadf32 : z_replicate_load<f32, load>;
-def z_replicate_loadf64 : z_replicate_load<f64, load>;
+def z_replicate_loadi8 : z_replicate_load<i32, z_anyextloadi8>;
+def z_replicate_loadi16 : z_replicate_load<i32, z_anyextloadi16>;
+def z_replicate_loadi32 : z_replicate_load<i32, z_load>;
+def z_replicate_loadi64 : z_replicate_load<i64, z_load>;
+def z_replicate_loadf32 : z_replicate_load<f32, z_load>;
+def z_replicate_loadf64 : z_replicate_load<f64, z_load>;
// Byte-swapped replicated vector element loads.
def z_replicate_loadbswapi16 : z_replicate_load<i32, z_loadbswap16>;
def z_replicate_loadbswapi32 : z_replicate_load<i32, z_loadbswap32>;
@@ -815,12 +913,12 @@ class z_vle<ValueType scalartype, SDPatternOperator load>
: PatFrag<(ops node:$vec, node:$addr, node:$index),
(z_vector_insert node:$vec, (scalartype (load node:$addr)),
node:$index)>;
-def z_vlei8 : z_vle<i32, anyextloadi8>;
-def z_vlei16 : z_vle<i32, anyextloadi16>;
-def z_vlei32 : z_vle<i32, load>;
-def z_vlei64 : z_vle<i64, load>;
-def z_vlef32 : z_vle<f32, load>;
-def z_vlef64 : z_vle<f64, load>;
+def z_vlei8 : z_vle<i32, z_anyextloadi8>;
+def z_vlei16 : z_vle<i32, z_anyextloadi16>;
+def z_vlei32 : z_vle<i32, z_load>;
+def z_vlei64 : z_vle<i64, z_load>;
+def z_vlef32 : z_vle<f32, z_load>;
+def z_vlef64 : z_vle<f64, z_load>;
// Byte-swapped vector element loads.
def z_vlebri16 : z_vle<i32, z_loadbswap16>;
def z_vlebri32 : z_vle<i32, z_loadbswap32>;
@@ -832,13 +930,13 @@ class z_vllez<ValueType scalartype, SDPatternOperator load, int index>
: PatFrag<(ops node:$addr),
(z_vector_insert immAllZerosV,
(scalartype (load node:$addr)), (i32 index))>;
-def z_vllezi8 : z_vllez<i32, anyextloadi8, 7>;
-def z_vllezi16 : z_vllez<i32, anyextloadi16, 3>;
-def z_vllezi32 : z_vllez<i32, load, 1>;
+def z_vllezi8 : z_vllez<i32, z_anyextloadi8, 7>;
+def z_vllezi16 : z_vllez<i32, z_anyextloadi16, 3>;
+def z_vllezi32 : z_vllez<i32, z_load, 1>;
def z_vllezi64 : PatFrags<(ops node:$addr),
[(z_vector_insert immAllZerosV,
- (i64 (load node:$addr)), (i32 0)),
- (z_join_dwords (i64 (load node:$addr)), (i64 0))]>;
+ (i64 (z_load node:$addr)), (i32 0)),
+ (z_join_dwords (i64 (z_load node:$addr)), (i64 0))]>;
// We use high merges to form a v4f32 from four f32s. Propagating zero
// into all elements but index 1 gives this expression.
def z_vllezf32 : PatFrag<(ops node:$addr),
@@ -848,23 +946,23 @@ def z_vllezf32 : PatFrag<(ops node:$addr),
(v4i32
(bitconvert
(v4f32 (scalar_to_vector
- (f32 (load node:$addr)))))))),
+ (f32 (z_load node:$addr)))))))),
(v2i64
(bitconvert (v4f32 immAllZerosV))))>;
def z_vllezf64 : PatFrag<(ops node:$addr),
(z_merge_high
- (v2f64 (scalar_to_vector (f64 (load node:$addr)))),
+ (v2f64 (scalar_to_vector (f64 (z_load node:$addr)))),
immAllZerosV)>;
// Similarly for the high element of a zeroed vector.
-def z_vllezli32 : z_vllez<i32, load, 0>;
+def z_vllezli32 : z_vllez<i32, z_load, 0>;
def z_vllezlf32 : PatFrag<(ops node:$addr),
(z_merge_high
(v2i64
(bitconvert
(z_merge_high
(v4f32 (scalar_to_vector
- (f32 (load node:$addr)))),
+ (f32 (z_load node:$addr)))),
(v4f32 immAllZerosV)))),
(v2i64
(bitconvert (v4f32 immAllZerosV))))>;
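
(Not part of the patch, just a sketch of how the new z_ fragments are meant to be consumed: an existing SystemZ load pattern keeps its shape and only swaps the operator, so the same instruction also covers the equivalent extending atomic load. LGH and bdxaddr20only are existing backend names; the standalone Pat form below is hypothetical, since the real definitions go through the instruction classes.)

// Sketch only: with z_asextloadi16 instead of asextloadi16, the same
// pattern now matches both a regular and an atomic sign-extending i16
// load (the PatFrags above accept LoadSDNode and AtomicSDNode alike via
// MemSDNode / getLoadExtType).
def : Pat<(i64 (z_asextloadi16 bdxaddr20only:$addr)),
          (LGH bdxaddr20only:$addr)>;
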
diff --git a/llvm/lib/Target/SystemZ/SystemZPatterns.td b/llvm/lib/Target/SystemZ/SystemZPatterns.td
index 5e5dca77e9553b..4d6bc68e9a7edd 100644
--- a/llvm/lib/Target/SystemZ/SystemZPatterns.td
+++ b/llvm/lib/Target/SystemZ/SystemZPatterns.td
@@ -49,8 +49,8 @@ class RMWI<SDPatternOperator load, SDPatternOperator operator,
// memory location. IMM is the type of the second operand.
multiclass RMWIByte<SDPatternOperator operator, AddressingMode mode,
Instruction insn> {
- def : RMWI<anyextloadi8, operator, truncstorei8, mode, imm32, insn>;
- def : RMWI<anyextloadi8, operator, truncstorei8, mode, imm64, insn>;
+ def : RMWI<z_anyextloadi8, operator, truncstorei8, mode, imm32, insn>;
+ def : RMWI<z_anyextloadi8, operator, truncstorei8, mode, imm64, insn>;
}
// Record that INSN performs insertion TYPE into a register of class CLS.
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index 1e548d7c101a7a..3bb4636e07b8d1 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -1789,10 +1789,10 @@ defm : TRUNC64m<truncstorei32, STLrri, STLrii, STLzri, ST1Bzii>;
multiclass ATMLDm<SDPatternOperator from,
RM torri, RM torii,
RM tozri, RM tozii> {
- def : Pat<(from ADDRrri:$addr), (torri MEMrri:$addr)>;
- def : Pat<(from ADDRrii:$addr), (torii MEMrii:$addr)>;
- def : Pat<(from ADDRzri:$addr), (tozri MEMzri:$addr)>;
- def : Pat<(from ADDRzii:$addr), (tozii MEMzii:$addr)>;
+ def : Pat<(iAny (from ADDRrri:$addr)), (torri MEMrri:$addr)>;
+ def : Pat<(iAny (from ADDRrii:$addr)), (torii MEMrii:$addr)>;
+ def : Pat<(iAny (from ADDRzri:$addr)), (tozri MEMzri:$addr)>;
+ def : Pat<(iAny (from ADDRzii:$addr)), (tozii MEMzii:$addr)>;
}
defm : ATMLDm<atomic_load_8, LD1BZXrri, LD1BZXrii, LD1BZXzri, LD1BZXzii>;
defm : ATMLDm<atomic_load_16, LD2BZXrri, LD2BZXrii, LD2BZXzri, LD2BZXzii>;
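
(For reference, a sketch of the first pattern that the atomic_load_8 instantiation of ATMLDm expands to after this change; the names are existing VE definitions. My reading is that the explicit iAny wrapper pins the atomic_load result to an integer type while the concrete width is still taken from the instruction's register class.)

// Sketch of one expansion of
//   defm : ATMLDm<atomic_load_8, LD1BZXrri, LD1BZXrii, LD1BZXzri, LD1BZXzii>;
def : Pat<(iAny (atomic_load_8 ADDRrri:$addr)), (LD1BZXrri MEMrri:$addr)>;
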
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-06.ll b/llvm/test/CodeGen/SystemZ/atomic-load-06.ll
index d75f15a574f7ef..60ff780df87b0e 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-06.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-06.ll
@@ -4,7 +4,7 @@
define float @f1(ptr %src) {
; CHECK-LABEL: f1:
-; CHECK: le %f0
+; CHECK: le %f0, 0(%r2)
; CHECK: br %r14
%val = load atomic float, ptr %src seq_cst, align 4
ret float %val
diff --git a/llvm/test/CodeGen/SystemZ/atomic-memops-fp128.ll b/llvm/test/CodeGen/SystemZ/atomic-memops-fp128.ll
new file mode 100644
index 00000000000000..8038329c0e09a0
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/atomic-memops-fp128.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+;
+; Test fpext of atomic loads to fp128 without VectorEnhancements1 (using FP register pairs).
+
+define fp128 @f1(ptr %src) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lxeb %f0, 0(%r3)
+; CHECK-NEXT: std %f0, 0(%r2)
+; CHECK-NEXT: std %f2, 8(%r2)
+; CHECK-NEXT: br %r14
+ %V = load atomic float, ptr %src seq_cst, align 4
+ %Res = fpext float %V to fp128
+ ret fp128 %Res
+}
+
+define fp128 @f2(ptr %src) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lxdb %f0, 0(%r3)
+; CHECK-NEXT: std %f0, 0(%r2)
+; CHECK-NEXT: std %f2, 8(%r2)
+; CHECK-NEXT: br %r14
+ %V = load atomic double, ptr %src seq_cst, align 8
+ %Res = fpext double %V to fp128
+ ret fp128 %Res
+}
+
+
+
diff --git a/llvm/test/CodeGen/SystemZ/atomic-memofolds.ll b/llvm/test/CodeGen/SystemZ/atomic-memops.ll
similarity index 94%
rename from llvm/test/CodeGen/SystemZ/atomic-memofolds.ll
rename to llvm/test/CodeGen/SystemZ/atomic-memops.ll
index fa1578df04bec1..9d799630ae8f8f 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-memofolds.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-memops.ll
@@ -182,6 +182,41 @@ define float @f15(float %f1, ptr %ptr, float %acc) {
}
declare float @llvm.fma.f32(float %f1, float %f2, float %f3)
+define double @f15_b(ptr %src) {
+; CHECK-LABEL: f15_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ldeb %f0, 0(%r2)
+; CHECK-NEXT: br %r14
+ %V = load atomic float, ptr %src seq_cst, align 4
+ %Res = fpext float %V to double
+ ret double %Res
+}
+
+define fp128 @f15_c(ptr %src) {
+; CHECK-LABEL: f15_c:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lde %f0, 0(%r3)
+; CHECK-NEXT: ldebr %f0, %f0
+; CHECK-NEXT: wflld %v0, %f0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %V = load atomic float, ptr %src seq_cst, align 4
+ %Res = fpext float %V to fp128
+ ret fp128 %Res
+}
+
+define fp128 @f15_d(ptr %src) {
+; CHECK-LABEL: f15_d:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld %f0, 0(%r3)
+; CHECK-NEXT: wflld %v0, %f0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %V = load atomic double, ptr %src seq_cst, align 8
+ %Res = fpext double %V to fp128
+ ret fp128 %Res
+}
+
; Do it twice for good measure given the involved DAG combines.
define void @f16(ptr %src, ptr %dst) {
; CHECK-LABEL: f16:
@@ -299,10 +334,10 @@ define double @f20(ptr %src, double %a, double %b) {
; CHECK-LABEL: f20:
; CHECK: # %bb.0:
; CHECK-NEXT: tm 0(%r2), 1
-; CHECK-NEXT: je .LBB22_2
+; CHECK-NEXT: je .LBB25_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: ldr %f2, %f0
-; CHECK-NEXT: .LBB22_2:
+; CHECK-NEXT: .LBB25_2:
; CHECK-NEXT: ldr %f0, %f2
; CHECK-NEXT: br %r14
%byte = load atomic i8, ptr %src seq_cst, align 1
@@ -641,7 +676,7 @@ define void @f43(ptr %ptr) {
define void @f44(ptr %ptr) {
; CHECK-LABEL: f44:
; CHECK: # %bb.0:
-; CHECK-NEXT: larl %r1, .LCPI50_0
+; CHECK-NEXT: larl %r1, .LCPI53_0
; CHECK-NEXT: ld %f0, 0(%r1)
; CHECK-NEXT: std %f0, 0(%r2)
; CHECK-NEXT: bcr 14, %r0
@@ -755,3 +790,14 @@ define void @f52(ptr %src, ptr %dst) {
store atomic i64 %b1, ptr %dst seq_cst, align 8
ret void
}
+
+define void @fun58(ptr %ptr, i64 %arg) {
+; CHECK-LABEL: fun58:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %r3, 0(%r2)
+; CHECK-NEXT: bcr 14, %r0
+; CHECK-NEXT: br %r14
+ %res = trunc i64 %arg to i32
+ store atomic i32 %res, ptr %ptr seq_cst, align 4
+ ret void
+}