[llvm] [NVPTX] Further cleanup call isel (PR #146411)

Alex MacLean via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 30 13:21:37 PDT 2025


https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/146411

>From 7f46a186b30b4eab1f0b121f20d66cc32058332b Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Mon, 30 Jun 2025 16:30:22 +0000
Subject: [PATCH] [NVPTX] Further cleanup call isel

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp   |  303 +++--
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     |   16 +-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       |   58 +-
 llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll       | 1080 ++++++++---------
 llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll       | 1080 ++++++++---------
 llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll       | 1080 ++++++++---------
 llvm/test/CodeGen/NVPTX/cmpxchg.ll            |  240 ++--
 llvm/test/CodeGen/NVPTX/convert-int-sm20.ll   |    6 +-
 llvm/test/CodeGen/NVPTX/extractelement.ll     |   21 +-
 .../CodeGen/NVPTX/lower-args-gridconstant.ll  |    8 +-
 llvm/test/CodeGen/NVPTX/lower-args.ll         |    4 +-
 llvm/test/CodeGen/NVPTX/misched_func_call.ll  |   12 +-
 llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll      |   15 +-
 llvm/test/CodeGen/NVPTX/st-param-imm.ll       |   60 +-
 .../NVPTX/unaligned-param-load-store.ll       |  192 +--
 llvm/test/CodeGen/NVPTX/variadics-backend.ll  |    2 +-
 16 files changed, 2087 insertions(+), 2090 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d817a3c6a8777..bb0aeb493ed48 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -28,6 +28,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/Register.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetCallingConv.h"
@@ -390,35 +391,27 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
 /// and promote them to a larger size if they're not.
 ///
-/// The promoted type is placed in \p PromoteVT if the function returns true.
+/// Returns the promoted type, or \p VT unchanged when no promotion is needed.
-static std::optional<MVT> PromoteScalarIntegerPTX(const EVT &VT) {
+static EVT promoteScalarIntegerPTX(const EVT VT) {
   if (VT.isScalarInteger()) {
-    MVT PromotedVT;
     switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
     default:
       llvm_unreachable(
           "Promotion is not suitable for scalars of size larger than 64-bits");
     case 1:
-      PromotedVT = MVT::i1;
-      break;
+      return MVT::i1;
     case 2:
     case 4:
     case 8:
-      PromotedVT = MVT::i8;
-      break;
+      return MVT::i8;
     case 16:
-      PromotedVT = MVT::i16;
-      break;
+      return MVT::i16;
     case 32:
-      PromotedVT = MVT::i32;
-      break;
+      return MVT::i32;
     case 64:
-      PromotedVT = MVT::i64;
-      break;
+      return MVT::i64;
     }
-    if (VT != PromotedVT)
-      return PromotedVT;
   }
-  return std::nullopt;
+  return VT;
 }
 
 // Check whether we can merge loads/stores of some of the pieces of a
@@ -1053,10 +1046,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     break;
 
     MAKE_CASE(NVPTXISD::RET_GLUE)
-    MAKE_CASE(NVPTXISD::DeclareParam)
+    MAKE_CASE(NVPTXISD::DeclareArrayParam)
     MAKE_CASE(NVPTXISD::DeclareScalarParam)
-    MAKE_CASE(NVPTXISD::DeclareRet)
-    MAKE_CASE(NVPTXISD::DeclareRetParam)
     MAKE_CASE(NVPTXISD::CALL)
     MAKE_CASE(NVPTXISD::LoadParam)
     MAKE_CASE(NVPTXISD::LoadParamV2)
@@ -1162,8 +1153,8 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
 }
 
 std::string NVPTXTargetLowering::getPrototype(
-    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
-    const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign RetAlign,
+    const DataLayout &DL, Type *RetTy, const ArgListTy &Args,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
     std::optional<unsigned> FirstVAArg, const CallBase &CB,
     unsigned UniqueCallSite) const {
   auto PtrVT = getPointerTy(DL);
@@ -1172,22 +1163,22 @@ std::string NVPTXTargetLowering::getPrototype(
   raw_string_ostream O(Prototype);
   O << "prototype_" << UniqueCallSite << " : .callprototype ";
 
-  if (retTy->isVoidTy()) {
+  if (RetTy->isVoidTy()) {
     O << "()";
   } else {
     O << "(";
-    if (shouldPassAsArray(retTy)) {
-      assert(RetAlign && "RetAlign must be set for non-void return types");
-      O << ".param .align " << RetAlign->value() << " .b8 _["
-        << DL.getTypeAllocSize(retTy) << "]";
-    } else if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
+    if (shouldPassAsArray(RetTy)) {
+      const Align RetAlign = getArgumentAlignment(&CB, RetTy, 0, DL);
+      O << ".param .align " << RetAlign.value() << " .b8 _["
+        << DL.getTypeAllocSize(RetTy) << "]";
+    } else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) {
       unsigned size = 0;
-      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
+      if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
         size = ITy->getBitWidth();
       } else {
-        assert(retTy->isFloatingPointTy() &&
+        assert(RetTy->isFloatingPointTy() &&
                "Floating point type expected here");
-        size = retTy->getPrimitiveSizeInBits();
+        size = RetTy->getPrimitiveSizeInBits();
       }
       // PTX ABI requires all scalar return values to be at least 32
       // bits in size.  fp16 normally uses .b16 as its storage type in
@@ -1195,7 +1186,7 @@ std::string NVPTXTargetLowering::getPrototype(
       size = promoteScalarArgumentSize(size);
 
       O << ".param .b" << size << " _";
-    } else if (isa<PointerType>(retTy)) {
+    } else if (isa<PointerType>(RetTy)) {
       O << ".param .b" << PtrVT.getSizeInBits() << " _";
     } else {
       llvm_unreachable("Unknown return type");
@@ -1256,7 +1247,7 @@ std::string NVPTXTargetLowering::getPrototype(
 
   if (FirstVAArg)
     O << (first ? "" : ",") << " .param .align "
-      << STI.getMaxRequiredAlignment() << " .b8 _[]\n";
+      << STI.getMaxRequiredAlignment() << " .b8 _[]";
   O << ")";
   if (shouldEmitPTXNoReturn(&CB, *nvTM))
     O << " .noreturn";
@@ -1442,6 +1433,21 @@ static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags) {
   return ISD::ANY_EXTEND;
 }
 
+static SDValue correctParamType(SDValue V, EVT ExpectedVT,
+                                ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+                                SDLoc dl) {
+  const EVT ActualVT = V.getValueType();
+  assert((ActualVT == ExpectedVT ||
+          (ExpectedVT.isInteger() && ActualVT.isInteger())) &&
+         "Non-integer argument type size mismatch");
+  if (ExpectedVT.bitsGT(ActualVT))
+    return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V);
+  if (ExpectedVT.bitsLT(ActualVT))
+    return DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V);
+
+  return V;
+}
+
 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
 
@@ -1505,9 +1511,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
          "Outs and OutVals must be the same size");
   // Declare the .params or .reg need to pass values
   // to the function
-  for (const auto [ArgI, Arg] : llvm::enumerate(Args)) {
-    const auto ArgOuts = AllOuts.take_while(
-        [ArgI = ArgI](auto O) { return O.OrigArgIndex == ArgI; });
+  for (const auto E : llvm::enumerate(Args)) {
+    const auto ArgI = E.index();
+    const auto Arg = E.value();
+    const auto ArgOuts =
+        AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; });
     const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size());
     AllOuts = AllOuts.drop_front(ArgOuts.size());
     AllOutVals = AllOutVals.drop_front(ArgOuts.size());
@@ -1515,6 +1523,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     const bool IsVAArg = (ArgI >= FirstVAArg);
     const bool IsByVal = Arg.IsByVal;
 
+    const SDValue ParamSymbol =
+        getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
+
     SmallVector<EVT, 16> VTs;
     SmallVector<uint64_t, 16> Offsets;
 
@@ -1525,38 +1536,43 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     assert(VTs.size() == Offsets.size() && "Size mismatch");
     assert((IsByVal || VTs.size() == ArgOuts.size()) && "Size mismatch");
 
-    Align ArgAlign;
-    if (IsByVal) {
-      // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
-      // so we don't need to worry whether it's naturally aligned or not.
-      // See TargetLowering::LowerCallTo().
-      Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
-      ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
-                                            InitialAlign, DL);
-      if (IsVAArg)
-        VAOffset = alignTo(VAOffset, ArgAlign);
-    } else {
-      ArgAlign = getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
-    }
+    const Align ArgAlign = [&]() {
+      if (IsByVal) {
+        // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
+        // so we don't need to worry whether it's naturally aligned or not.
+        // See TargetLowering::LowerCallTo().
+        const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
+        const Align ByValAlign = getFunctionByValParamAlign(
+            CB->getCalledFunction(), ETy, InitialAlign, DL);
+        if (IsVAArg)
+          VAOffset = alignTo(VAOffset, ByValAlign);
+        return ByValAlign;
+      }
+      return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
+    }();
 
     const unsigned TypeSize = DL.getTypeAllocSize(ETy);
     assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) &&
            "type size mismatch");
 
-    const bool PassAsArray = IsByVal || shouldPassAsArray(Arg.Ty);
-    if (IsVAArg) {
-      if (ArgI == FirstVAArg) {
-        VADeclareParam = Chain =
-            DAG.getNode(NVPTXISD::DeclareParam, dl, {MVT::Other, MVT::Glue},
-                        {Chain, GetI32(STI.getMaxRequiredAlignment()),
-                         GetI32(ArgI), GetI32(1), InGlue});
+    const std::optional<SDValue> ArgDeclare = [&]() -> std::optional<SDValue> {
+      if (IsVAArg) {
+        if (ArgI == FirstVAArg) {
+          VADeclareParam = DAG.getNode(
+              NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
+              {Chain, ParamSymbol, GetI32(STI.getMaxRequiredAlignment()),
+               GetI32(0), InGlue});
+          return VADeclareParam;
+        }
+        return std::nullopt;
+      }
+      if (IsByVal || shouldPassAsArray(Arg.Ty)) {
+        // declare .param .align <align> .b8 param<n>[<size>];
+        return DAG.getNode(NVPTXISD::DeclareArrayParam, dl,
+                           {MVT::Other, MVT::Glue},
+                           {Chain, ParamSymbol, GetI32(ArgAlign.value()),
+                            GetI32(TypeSize), InGlue});
       }
-    } else if (PassAsArray) {
-      // declare .param .align <align> .b8 .param<n>[<size>];
-      Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, {MVT::Other, MVT::Glue},
-                          {Chain, GetI32(ArgAlign.value()), GetI32(ArgI),
-                           GetI32(TypeSize), InGlue});
-    } else {
       assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
       // declare .param .b<size> .param<n>;
 
@@ -1568,11 +1584,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
               ? promoteScalarArgumentSize(TypeSize * 8)
               : TypeSize * 8;
 
-      Chain =
-          DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
-                      {Chain, GetI32(ArgI), GetI32(PromotedSize), InGlue});
+      return DAG.getNode(NVPTXISD::DeclareScalarParam, dl,
+                         {MVT::Other, MVT::Glue},
+                         {Chain, ParamSymbol, GetI32(PromotedSize), InGlue});
+    }();
+    if (ArgDeclare) {
+      Chain = ArgDeclare->getValue(0);
+      InGlue = ArgDeclare->getValue(1);
     }
-    InGlue = Chain.getValue(1);
 
     // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
     // than 32-bits are sign extended or zero extended, depending on
@@ -1594,8 +1613,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       } else {
         StVal = ArgOutVals[I];
 
-        if (auto PromotedVT = PromoteScalarIntegerPTX(StVal.getValueType())) {
-          StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, *PromotedVT,
+        auto PromotedVT = promoteScalarIntegerPTX(StVal.getValueType());
+        if (PromotedVT != StVal.getValueType()) {
+          StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, PromotedVT,
                               StVal);
         }
       }
@@ -1619,12 +1639,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     unsigned J = 0;
     for (const unsigned NumElts : VectorInfo) {
       const int CurOffset = Offsets[J];
-      EVT EltVT = VTs[J];
+      EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
       const Align PartAlign = commonAlignment(ArgAlign, CurOffset);
 
-      if (auto PromotedVT = PromoteScalarIntegerPTX(EltVT))
-        EltVT = *PromotedVT;
-
       // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
       // scalar store. In such cases, fall back to byte stores.
       if (NumElts == 1 && !IsVAArg && PartAlign < DAG.getEVTAlign(EltVT)) {
@@ -1695,27 +1712,26 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   }
 
   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
-  MaybeAlign RetAlign = std::nullopt;
 
   // Handle Result
   if (!Ins.empty()) {
-    RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
-
-    // Declare
-    //  .param .align N .b8 retval0[<size-in-bytes>], or
-    //  .param .b<size-in-bits> retval0
-    const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy);
-    if (!shouldPassAsArray(RetTy)) {
-      const unsigned PromotedResultSize = promoteScalarArgumentSize(ResultSize);
-      Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, {MVT::Other, MVT::Glue},
-                          {Chain, GetI32(PromotedResultSize), InGlue});
-      InGlue = Chain.getValue(1);
-    } else {
-      Chain = DAG.getNode(
-          NVPTXISD::DeclareRetParam, dl, {MVT::Other, MVT::Glue},
-          {Chain, GetI32(RetAlign->value()), GetI32(ResultSize / 8), InGlue});
-      InGlue = Chain.getValue(1);
-    }
+    const SDValue RetDeclare = [&]() {
+      const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
+      const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy);
+      if (shouldPassAsArray(RetTy)) {
+        const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
+        return DAG.getNode(NVPTXISD::DeclareArrayParam, dl,
+                           {MVT::Other, MVT::Glue},
+                           {Chain, RetSymbol, GetI32(RetAlign.value()),
+                            GetI32(ResultSize / 8), InGlue});
+      }
+      const auto PromotedResultSize = promoteScalarArgumentSize(ResultSize);
+      return DAG.getNode(
+          NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
+          {Chain, RetSymbol, GetI32(PromotedResultSize), InGlue});
+    }();
+    Chain = RetDeclare.getValue(0);
+    InGlue = RetDeclare.getValue(1);
   }
 
   const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
@@ -1760,7 +1776,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // The prototype is embedded in a string and put as the operand for a
     // CallPrototype SDNode which will print out to the value of the string.
     std::string Proto =
-        getPrototype(DL, RetTy, Args, CLI.Outs, RetAlign,
+        getPrototype(DL, RetTy, Args, CLI.Outs,
                      HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
                      UniqueCallSite);
     const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
@@ -1773,11 +1789,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   if (ConvertToIndirectCall) {
     // Copy the function ptr to a ptx register and use the register to call the
     // function.
-    EVT DestVT = Callee.getValueType();
-    MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();
+    const MVT DestVT = Callee.getValueType().getSimpleVT();
+    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    unsigned DestReg =
-        RegInfo.createVirtualRegister(TLI.getRegClassFor(DestVT.getSimpleVT()));
+    Register DestReg = MRI.createVirtualRegister(TLI.getRegClassFor(DestVT));
     auto RegCopy = DAG.getCopyToReg(DAG.getEntryNode(), dl, DestReg, Callee);
     Callee = DAG.getCopyFromReg(RegCopy, dl, DestReg, DestVT);
   }
@@ -1810,7 +1825,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
     assert(VTs.size() == Ins.size() && "Bad value decomposition");
 
-    assert(RetAlign && "RetAlign is guaranteed to be set");
+    const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
 
     // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
     // 32-bits are sign extended or zero extended, depending on whether
@@ -1818,17 +1833,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     const bool ExtendIntegerRetVal =
         RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
 
-    const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, *RetAlign);
+    const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
     unsigned I = 0;
     for (const unsigned VectorizedSize : VectorInfo) {
-      EVT TheLoadType = VTs[I];
+      EVT TheLoadType = promoteScalarIntegerPTX(VTs[I]);
       EVT EltType = Ins[I].VT;
-      const Align EltAlign = commonAlignment(*RetAlign, Offsets[I]);
+      const Align EltAlign = commonAlignment(RetAlign, Offsets[I]);
 
-      if (auto PromotedVT = PromoteScalarIntegerPTX(TheLoadType)) {
-        TheLoadType = *PromotedVT;
-        EltType = *PromotedVT;
-      }
+      if (TheLoadType != VTs[I])
+        EltType = TheLoadType;
 
       if (ExtendIntegerRetVal) {
         TheLoadType = MVT::i32;
@@ -1898,13 +1911,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       continue;
     }
 
-    SDValue Ret = DAG.getNode(
-        NVPTXISD::ProxyReg, dl,
-        {ProxyRegOps[I].getSimpleValueType(), MVT::Other, MVT::Glue},
-        {Chain, ProxyRegOps[I], InGlue});
-
-    Chain = Ret.getValue(1);
-    InGlue = Ret.getValue(2);
+    SDValue Ret =
+        DAG.getNode(NVPTXISD::ProxyReg, dl, ProxyRegOps[I].getSimpleValueType(),
+                    {Chain, ProxyRegOps[I]});
 
     const EVT ExpectedVT = Ins[I].VT;
     if (!Ret.getValueType().bitsEq(ExpectedVT)) {
@@ -1914,14 +1923,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   }
 
   for (SDValue &T : TempProxyRegOps) {
-    SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl,
-                               {T.getSimpleValueType(), MVT::Other, MVT::Glue},
-                               {Chain, T.getOperand(0), InGlue});
+    SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl, T.getSimpleValueType(),
+                               {Chain, T.getOperand(0)});
     DAG.ReplaceAllUsesWith(T, Repl);
     DAG.RemoveDeadNode(T.getNode());
-
-    Chain = Repl.getValue(1);
-    InGlue = Repl.getValue(2);
   }
 
   // set isTailCall to false for now, until we figure out how to express
@@ -3293,11 +3298,17 @@ bool NVPTXTargetLowering::splitValueIntoRegisterParts(
 // Name of the symbol is composed from its index and the function name.
 // Negative index corresponds to special parameter (unsized array) used for
 // passing variable arguments.
-SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
-                                            EVT v) const {
+SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I,
+                                            EVT T) const {
   StringRef SavedStr = nvTM->getStrPool().save(
-      getParamName(&DAG.getMachineFunction().getFunction(), idx));
-  return DAG.getExternalSymbol(SavedStr.data(), v);
+      getParamName(&DAG.getMachineFunction().getFunction(), I));
+  return DAG.getExternalSymbol(SavedStr.data(), T);
+}
+
+SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I,
+                                                EVT T) const {
+  const StringRef SavedStr = nvTM->getStrPool().save("param" + Twine(I));
+  return DAG.getExternalSymbol(SavedStr.data(), T);
 }
 
 SDValue NVPTXTargetLowering::LowerFormalArguments(
@@ -3394,8 +3405,11 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
         const unsigned PackingAmt =
             LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
 
-        const EVT VecVT = EVT::getVectorVT(
-            F->getContext(), LoadVT.getScalarType(), NumElts * PackingAmt);
+        const EVT VecVT =
+            NumElts == 1
+                ? LoadVT
+                : EVT::getVectorVT(F->getContext(), LoadVT.getScalarType(),
+                                   NumElts * PackingAmt);
 
         SDValue VecAddr = DAG.getObjectPtrOffset(
             dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
@@ -3409,22 +3423,16 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
         if (P.getNode())
           P.getNode()->setIROrder(Arg.getArgNo() + 1);
         for (const unsigned J : llvm::seq(NumElts)) {
-          SDValue Elt = DAG.getNode(
-              LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR
-                                : ISD::EXTRACT_VECTOR_ELT,
-              dl, LoadVT, P, DAG.getVectorIdxConstant(J * PackingAmt, dl));
-
-          // Extend or truncate the element if necessary (e.g. an i8 is loaded
-          // into an i16 register)
-          const EVT ExpectedVT = ArgIns[I + J].VT;
-          assert((Elt.getValueType() == ExpectedVT ||
-                  (ExpectedVT.isInteger() && Elt.getValueType().isInteger())) &&
-                 "Non-integer argument type size mismatch");
-          if (ExpectedVT.bitsGT(Elt.getValueType()))
-            Elt = DAG.getNode(getExtOpcode(ArgIns[I + J].Flags), dl, ExpectedVT,
-                              Elt);
-          else if (ExpectedVT.bitsLT(Elt.getValueType()))
-            Elt = DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, Elt);
+          SDValue Elt =
+              NumElts == 1
+                  ? P
+                  : DAG.getNode(LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR
+                                                  : ISD::EXTRACT_VECTOR_ELT,
+                                dl, LoadVT, P,
+                                DAG.getVectorIdxConstant(J * PackingAmt, dl));
+
+          Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
+                                 DAG, dl);
           InVals.push_back(Elt);
         }
         I += NumElts;
@@ -3467,25 +3475,14 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 
   const auto GetRetVal = [&](unsigned I) -> SDValue {
     SDValue RetVal = OutVals[I];
-    assert(!PromoteScalarIntegerPTX(RetVal.getValueType()) &&
+    assert(promoteScalarIntegerPTX(RetVal.getValueType()) ==
+               RetVal.getValueType() &&
            "OutVal type should always be legal");
 
-    EVT VTI = VTs[I];
-    if (const auto PromotedVT = PromoteScalarIntegerPTX(VTI))
-      VTI = *PromotedVT;
-
+    const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
     const EVT StoreVT =
         ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
-
-    assert((RetVal.getValueType() == StoreVT ||
-            (StoreVT.isInteger() && RetVal.getValueType().isInteger())) &&
-           "Non-integer argument type size mismatch");
-    if (StoreVT.bitsGT(RetVal.getValueType())) {
-      RetVal = DAG.getNode(getExtOpcode(Outs[I].Flags), dl, StoreVT, RetVal);
-    } else if (StoreVT.bitsLT(RetVal.getValueType())) {
-      RetVal = DAG.getNode(ISD::TRUNCATE, dl, StoreVT, RetVal);
-    }
-    return RetVal;
+    return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
   };
 
   const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
@@ -3500,7 +3497,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     if (NumElts == 1) {
       Val = GetRetVal(I);
     } else {
-      SmallVector<SDValue, 6> StoreVals;
+      SmallVector<SDValue, 4> StoreVals;
       for (const unsigned J : llvm::seq(NumElts)) {
         SDValue ValJ = GetRetVal(I + J);
         if (ValJ.getValueType().isVector())
@@ -3514,7 +3511,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
       Val = DAG.getBuildVector(VT, dl, StoreVals);
     }
 
-    SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
+    const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
     SDValue Ptr =
         DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 3a8091fecfde1..2477e1fb61595 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -25,10 +25,15 @@ enum NodeType : unsigned {
   // Start the numbering from where ISD NodeType finishes.
   FIRST_NUMBER = ISD::BUILTIN_OP_END,
   RET_GLUE,
-  DeclareParam,
+
+  /// These nodes represent a parameter declaration. In PTX this will look like:
+  ///   .param .align 16 .b8 param0[1024];
+  ///   .param .b32 retval0;
+  ///
+  /// DeclareArrayParam(Chain, ExternalSym, Align, SizeInBytes, Glue)
+  /// DeclareScalarParam(Chain, ExternalSym, SizeInBits, Glue)
   DeclareScalarParam,
-  DeclareRetParam,
-  DeclareRet,
+  DeclareArrayParam,
 
   /// This node represents a PTX call instruction. It's operands are as follows:
   ///
@@ -174,7 +179,6 @@ class NVPTXTargetLowering : public TargetLowering {
 
   std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &,
                            const SmallVectorImpl<ISD::OutputArg> &,
-                           MaybeAlign RetAlign,
                            std::optional<unsigned> FirstVAArg,
                            const CallBase &CB, unsigned UniqueCallSite) const;
 
@@ -272,8 +276,8 @@ class NVPTXTargetLowering : public TargetLowering {
   const NVPTXSubtarget &STI; // cache the subtarget here
   mutable unsigned GlobalUniqueCallSite;
 
-  SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
-
+  SDValue getParamSymbol(SelectionDAG &DAG, int I, EVT T) const;
+  SDValue getCallParamSymbol(SelectionDAG &DAG, int I, EVT T) const;
   SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 1a2515b7f66f3..441ddeeb7d667 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1990,9 +1990,9 @@ defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
 defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
 defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
 
-def SDTDeclareParamProfile :
+def SDTDeclareArrayParam :
   SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
-def SDTDeclareScalarParamProfile :
+def SDTDeclareScalarParam :
   SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
 def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
 def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
@@ -2001,22 +2001,17 @@ def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
 def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
 def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
 def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>;
-def SDTProxyRegProfile : SDTypeProfile<1, 1, []>;
 
-def DeclareParam :
-  SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
-         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareScalarParam :
-  SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
-         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareRetParam :
-  SDNode<"NVPTXISD::DeclareRetParam",
-         SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>,
+def SDTProxyReg : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>;
+
+
+def declare_array_param :
+  SDNode<"NVPTXISD::DeclareArrayParam", SDTDeclareArrayParam,
          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareRet :
-  SDNode<"NVPTXISD::DeclareRet",
-         SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>,
+def declare_scalar_param :
+  SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParam,
          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+
 def LoadParam :
   SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
          [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
@@ -2037,9 +2032,8 @@ def StoreParamV4 :
          [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
 def MoveParam :
   SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
-def ProxyReg :
-  SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
-         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def proxy_reg :
+  SDNode<"NVPTXISD::ProxyReg", SDTProxyReg, [SDNPHasChain]>;
 
   /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
   ///      NumParams, Callee, Proto, InGlue)
@@ -2188,23 +2182,17 @@ defm StoreParamV2F64  : StoreParamV2Inst<B64, f64imm, ".b64">;
 
 defm StoreParamV4F32  : StoreParamV4Inst<B32, f32imm, ".b32">;
 
-def DeclareRetMemInst :
-  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size),
-            ".param .align $align .b8 retval0[$size];",
-            [(DeclareRetParam imm:$align, imm:$size)]>;
-def DeclareRetScalarInst :
-  NVPTXInst<(outs), (ins i32imm:$size),
-            ".param .b$size retval0;",
-            [(DeclareRet imm:$size)]>;
-
-def DeclareParamInst :
-  NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
-            ".param .align $align .b8 param$a[$size];",
-            [(DeclareParam imm:$align, imm:$a, imm:$size)]>;
-def DeclareScalarParamInst :
+def DECLARE_PARAM_array :
+  NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size),
+            ".param .align $align .b8 \t$a[$size];", []>;
+def DECLARE_PARAM_scalar :
   NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
-            ".param .b$size param$a;",
-            [(DeclareScalarParam imm:$a, imm:$size)]>;
+            ".param .b$size \t$a;", []>;
+
+def : Pat<(declare_array_param externalsym:$a, imm:$align, imm:$size),
+          (DECLARE_PARAM_array (to_texternsym $a), imm:$align, imm:$size)>;
+def : Pat<(declare_scalar_param externalsym:$a, imm:$size),
+          (DECLARE_PARAM_scalar (to_texternsym $a), imm:$size)>;
 
 foreach t = [I32RT, I64RT] in {
   defvar inst_name = "MOV" # t.Size # "_PARAM";
@@ -2217,7 +2205,7 @@ multiclass ProxyRegInst<string SzStr, NVPTXRegClass rc> {
   def NAME : BasicNVPTXInst<(outs rc:$dst), (ins rc:$src),
                  "mov." # SzStr>;
   foreach vt = rc.RegTypes in
-    def : Pat<(vt (ProxyReg vt:$src)), (!cast<NVPTXInst>(NAME) $src)>;
+    def : Pat<(vt (proxy_reg vt:$src)), (!cast<NVPTXInst>(NAME) $src)>;
 }
 
 defm ProxyRegB1  : ProxyRegInst<"pred", B1>;
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
index 65a077d67e4ba..c99860cc5cc1b 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
@@ -14,17 +14,17 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
@@ -41,7 +41,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB0_1;
 ; SM60-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
     ret i8 %new
@@ -59,17 +59,17 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
@@ -86,7 +86,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB1_1;
 ; SM60-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
     ret i8 %new
@@ -104,17 +104,17 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
@@ -131,7 +131,7 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB2_1;
 ; SM60-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
     ret i8 %new
@@ -149,17 +149,17 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
@@ -177,7 +177,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB3_1;
 ; SM60-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
     ret i8 %new
@@ -195,17 +195,17 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
@@ -223,7 +223,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    @%p2 bra $L__BB4_1;
 ; SM60-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
     ret i8 %new
@@ -241,17 +241,17 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
@@ -269,7 +269,7 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    @%p2 bra $L__BB5_1;
 ; SM60-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
     ret i8 %new
@@ -287,18 +287,18 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
@@ -316,7 +316,7 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB6_1;
 ; SM60-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
     ret i8 %new
@@ -334,18 +334,18 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
@@ -363,7 +363,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    @%p2 bra $L__BB7_1;
 ; SM60-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
     ret i8 %new
@@ -381,18 +381,18 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
@@ -410,7 +410,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    @%p2 bra $L__BB8_1;
 ; SM60-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
     ret i8 %new
@@ -428,17 +428,17 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
@@ -456,7 +456,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB9_1;
 ; SM60-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
     ret i8 %new
@@ -474,17 +474,17 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
@@ -502,7 +502,7 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    @%p2 bra $L__BB10_1;
 ; SM60-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
     ret i8 %new
@@ -520,17 +520,17 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
@@ -548,7 +548,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    @%p2 bra $L__BB11_1;
 ; SM60-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
     ret i8 %new
@@ -566,17 +566,17 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
@@ -594,7 +594,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB12_1;
 ; SM60-NEXT:  $L__BB12_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
     ret i8 %new
@@ -612,17 +612,17 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_global_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
@@ -640,7 +640,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB13_1;
 ; SM60-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
     ret i8 %new
@@ -658,17 +658,17 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
@@ -686,7 +686,7 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB14_1;
 ; SM60-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
     ret i8 %new
@@ -704,18 +704,18 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
@@ -733,7 +733,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB15_1;
 ; SM60-NEXT:  $L__BB15_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
     ret i8 %new
@@ -751,18 +751,18 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
@@ -780,7 +780,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB16_1;
 ; SM60-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
     ret i8 %new
@@ -798,18 +798,18 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
@@ -827,7 +827,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB17_1;
 ; SM60-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
     ret i8 %new
@@ -845,18 +845,18 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
@@ -873,7 +873,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB18_1;
 ; SM60-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
     ret i8 %new
@@ -891,18 +891,18 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
@@ -919,7 +919,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB19_1;
 ; SM60-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
     ret i8 %new
@@ -937,18 +937,18 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
@@ -965,7 +965,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    mov.b32 %r20, %r8;
 ; SM60-NEXT:    @%p2 bra $L__BB20_1;
 ; SM60-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
     ret i8 %new
@@ -983,18 +983,18 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [release_acquire_i8_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
@@ -1012,7 +1012,7 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB21_1;
 ; SM60-NEXT:  $L__BB21_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
     ret i8 %new
@@ -1030,18 +1030,18 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [release_acquire_i8_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
@@ -1059,7 +1059,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB22_1;
 ; SM60-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
     ret i8 %new
@@ -1077,18 +1077,18 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [release_acquire_i8_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
@@ -1106,7 +1106,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB23_1;
 ; SM60-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
     ret i8 %new
@@ -1124,18 +1124,18 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
@@ -1153,7 +1153,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB24_1;
 ; SM60-NEXT:  $L__BB24_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
     ret i8 %new
@@ -1171,18 +1171,18 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
@@ -1200,7 +1200,7 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB25_1;
 ; SM60-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
     ret i8 %new
@@ -1218,18 +1218,18 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
@@ -1247,7 +1247,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB26_1;
 ; SM60-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
     ret i8 %new
@@ -1265,18 +1265,18 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
@@ -1294,7 +1294,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB27_1;
 ; SM60-NEXT:  $L__BB27_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
     ret i8 %new
@@ -1312,18 +1312,18 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
@@ -1341,7 +1341,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    @%p2 bra $L__BB28_1;
 ; SM60-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
     ret i8 %new
@@ -1359,18 +1359,18 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
@@ -1388,7 +1388,7 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    @%p2 bra $L__BB29_1;
 ; SM60-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
     ret i8 %new
@@ -1406,18 +1406,18 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB30_1: // %partword.cmpxchg.loop
@@ -1435,7 +1435,7 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB30_1;
 ; SM60-NEXT:  $L__BB30_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
     ret i8 %new
@@ -1453,18 +1453,18 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB31_1: // %partword.cmpxchg.loop
@@ -1482,7 +1482,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB31_1;
 ; SM60-NEXT:  $L__BB31_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
     ret i8 %new
@@ -1500,18 +1500,18 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB32_1: // %partword.cmpxchg.loop
@@ -1529,7 +1529,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB32_1;
 ; SM60-NEXT:  $L__BB32_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
     ret i8 %new
@@ -1547,18 +1547,18 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB33_1: // %partword.cmpxchg.loop
@@ -1576,7 +1576,7 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB33_1;
 ; SM60-NEXT:  $L__BB33_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
     ret i8 %new
@@ -1594,18 +1594,18 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB34_1: // %partword.cmpxchg.loop
@@ -1623,7 +1623,7 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB34_1;
 ; SM60-NEXT:  $L__BB34_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
     ret i8 %new
@@ -1641,18 +1641,18 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB35_1: // %partword.cmpxchg.loop
@@ -1670,7 +1670,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB35_1;
 ; SM60-NEXT:  $L__BB35_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
     ret i8 %new
@@ -1688,18 +1688,18 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB36_1: // %partword.cmpxchg.loop
@@ -1717,7 +1717,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB36_1;
 ; SM60-NEXT:  $L__BB36_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
     ret i8 %new
@@ -1735,18 +1735,18 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB37_1: // %partword.cmpxchg.loop
@@ -1764,7 +1764,7 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    @%p2 bra $L__BB37_1;
 ; SM60-NEXT:  $L__BB37_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
     ret i8 %new
@@ -1782,18 +1782,18 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB38_1: // %partword.cmpxchg.loop
@@ -1811,7 +1811,7 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM60-NEXT:    @%p2 bra $L__BB38_1;
 ; SM60-NEXT:  $L__BB38_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
     ret i8 %new
@@ -1829,18 +1829,18 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB39_1: // %partword.cmpxchg.loop
@@ -1858,7 +1858,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB39_1;
 ; SM60-NEXT:  $L__BB39_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
     ret i8 %new
@@ -1876,18 +1876,18 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB40_1: // %partword.cmpxchg.loop
@@ -1905,7 +1905,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB40_1;
 ; SM60-NEXT:  $L__BB40_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
     ret i8 %new
@@ -1923,18 +1923,18 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB41_1: // %partword.cmpxchg.loop
@@ -1952,7 +1952,7 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB41_1;
 ; SM60-NEXT:  $L__BB41_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
     ret i8 %new
@@ -1970,18 +1970,18 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB42_1: // %partword.cmpxchg.loop
@@ -1999,7 +1999,7 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB42_1;
 ; SM60-NEXT:  $L__BB42_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
     ret i8 %new
@@ -2017,18 +2017,18 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB43_1: // %partword.cmpxchg.loop
@@ -2046,7 +2046,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB43_1;
 ; SM60-NEXT:  $L__BB43_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
     ret i8 %new
@@ -2064,18 +2064,18 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
 ; SM60-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
 ; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1];
 ; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM60-NEXT:    and.b32 %r10, %r9, 3;
-; SM60-NEXT:    shl.b32 %r1, %r10, 3;
-; SM60-NEXT:    mov.b32 %r11, 255;
-; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM60-NEXT:    not.b32 %r2, %r12;
-; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM60-NEXT:    and.b32 %r14, %r13, 255;
-; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM60-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
-; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 255;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    and.b32 %r15, %r14, 255;
+; SM60-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM60-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM60-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM60-NEXT:  $L__BB44_1: // %partword.cmpxchg.loop
@@ -2093,7 +2093,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM60-NEXT:    @%p2 bra $L__BB44_1;
 ; SM60-NEXT:  $L__BB44_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    membar.sys;
-; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM60-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
     ret i8 %new
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
index 7107fbcf6eb54..68de517f65bb9 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
@@ -14,17 +14,17 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
@@ -41,7 +41,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB0_1;
 ; SM70-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
     ret i8 %new
@@ -59,17 +59,17 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
@@ -86,7 +86,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB1_1;
 ; SM70-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
     ret i8 %new
@@ -104,17 +104,17 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
@@ -131,7 +131,7 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB2_1;
 ; SM70-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
     ret i8 %new
@@ -149,17 +149,17 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
@@ -177,7 +177,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB3_1;
 ; SM70-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
     ret i8 %new
@@ -195,17 +195,17 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
@@ -223,7 +223,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    @%p2 bra $L__BB4_1;
 ; SM70-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
     ret i8 %new
@@ -241,17 +241,17 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
@@ -269,7 +269,7 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    @%p2 bra $L__BB5_1;
 ; SM70-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
     ret i8 %new
@@ -287,18 +287,18 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
@@ -316,7 +316,7 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB6_1;
 ; SM70-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
     ret i8 %new
@@ -334,18 +334,18 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
@@ -363,7 +363,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    @%p2 bra $L__BB7_1;
 ; SM70-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
     ret i8 %new
@@ -381,18 +381,18 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
@@ -410,7 +410,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    @%p2 bra $L__BB8_1;
 ; SM70-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
     ret i8 %new
@@ -428,17 +428,17 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
@@ -456,7 +456,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB9_1;
 ; SM70-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
     ret i8 %new
@@ -474,17 +474,17 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
@@ -502,7 +502,7 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    @%p2 bra $L__BB10_1;
 ; SM70-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
     ret i8 %new
@@ -520,17 +520,17 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
@@ -548,7 +548,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    @%p2 bra $L__BB11_1;
 ; SM70-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
     ret i8 %new
@@ -566,17 +566,17 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
@@ -594,7 +594,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB12_1;
 ; SM70-NEXT:  $L__BB12_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
     ret i8 %new
@@ -612,17 +612,17 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_global_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
@@ -640,7 +640,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB13_1;
 ; SM70-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
     ret i8 %new
@@ -658,17 +658,17 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
@@ -686,7 +686,7 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB14_1;
 ; SM70-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
     ret i8 %new
@@ -704,18 +704,18 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
@@ -733,7 +733,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB15_1;
 ; SM70-NEXT:  $L__BB15_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
     ret i8 %new
@@ -751,18 +751,18 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
@@ -780,7 +780,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB16_1;
 ; SM70-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
     ret i8 %new
@@ -798,18 +798,18 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
@@ -827,7 +827,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB17_1;
 ; SM70-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
     ret i8 %new
@@ -845,18 +845,18 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
@@ -873,7 +873,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB18_1;
 ; SM70-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
     ret i8 %new
@@ -891,18 +891,18 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
@@ -919,7 +919,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB19_1;
 ; SM70-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
     ret i8 %new
@@ -937,18 +937,18 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
@@ -965,7 +965,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB20_1;
 ; SM70-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
     ret i8 %new
@@ -983,18 +983,18 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [release_acquire_i8_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
@@ -1012,7 +1012,7 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB21_1;
 ; SM70-NEXT:  $L__BB21_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
     ret i8 %new
@@ -1030,18 +1030,18 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [release_acquire_i8_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
@@ -1059,7 +1059,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB22_1;
 ; SM70-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
     ret i8 %new
@@ -1077,18 +1077,18 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [release_acquire_i8_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
@@ -1106,7 +1106,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB23_1;
 ; SM70-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
     ret i8 %new
@@ -1124,18 +1124,18 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
@@ -1153,7 +1153,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB24_1;
 ; SM70-NEXT:  $L__BB24_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
     ret i8 %new
@@ -1171,18 +1171,18 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
@@ -1200,7 +1200,7 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB25_1;
 ; SM70-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
     ret i8 %new
@@ -1218,18 +1218,18 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
@@ -1247,7 +1247,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB26_1;
 ; SM70-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
     ret i8 %new
@@ -1265,18 +1265,18 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
@@ -1294,7 +1294,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB27_1;
 ; SM70-NEXT:  $L__BB27_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
     ret i8 %new
@@ -1312,18 +1312,18 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
@@ -1341,7 +1341,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    @%p2 bra $L__BB28_1;
 ; SM70-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
     ret i8 %new
@@ -1359,18 +1359,18 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
@@ -1388,7 +1388,7 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    @%p2 bra $L__BB29_1;
 ; SM70-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
     ret i8 %new
@@ -1406,18 +1406,18 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB30_1: // %partword.cmpxchg.loop
@@ -1435,7 +1435,7 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB30_1;
 ; SM70-NEXT:  $L__BB30_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
     ret i8 %new
@@ -1453,18 +1453,18 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB31_1: // %partword.cmpxchg.loop
@@ -1482,7 +1482,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB31_1;
 ; SM70-NEXT:  $L__BB31_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
     ret i8 %new
@@ -1500,18 +1500,18 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB32_1: // %partword.cmpxchg.loop
@@ -1529,7 +1529,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB32_1;
 ; SM70-NEXT:  $L__BB32_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
     ret i8 %new
@@ -1547,18 +1547,18 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB33_1: // %partword.cmpxchg.loop
@@ -1576,7 +1576,7 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB33_1;
 ; SM70-NEXT:  $L__BB33_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
     ret i8 %new
@@ -1594,18 +1594,18 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB34_1: // %partword.cmpxchg.loop
@@ -1623,7 +1623,7 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB34_1;
 ; SM70-NEXT:  $L__BB34_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
     ret i8 %new
@@ -1641,18 +1641,18 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB35_1: // %partword.cmpxchg.loop
@@ -1670,7 +1670,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB35_1;
 ; SM70-NEXT:  $L__BB35_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
     ret i8 %new
@@ -1688,18 +1688,18 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB36_1: // %partword.cmpxchg.loop
@@ -1717,7 +1717,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB36_1;
 ; SM70-NEXT:  $L__BB36_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
     ret i8 %new
@@ -1735,18 +1735,18 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB37_1: // %partword.cmpxchg.loop
@@ -1764,7 +1764,7 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    @%p2 bra $L__BB37_1;
 ; SM70-NEXT:  $L__BB37_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
     ret i8 %new
@@ -1782,18 +1782,18 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB38_1: // %partword.cmpxchg.loop
@@ -1811,7 +1811,7 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM70-NEXT:    @%p2 bra $L__BB38_1;
 ; SM70-NEXT:  $L__BB38_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
     ret i8 %new
@@ -1829,18 +1829,18 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB39_1: // %partword.cmpxchg.loop
@@ -1858,7 +1858,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB39_1;
 ; SM70-NEXT:  $L__BB39_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
     ret i8 %new
@@ -1876,18 +1876,18 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB40_1: // %partword.cmpxchg.loop
@@ -1905,7 +1905,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB40_1;
 ; SM70-NEXT:  $L__BB40_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
     ret i8 %new
@@ -1923,18 +1923,18 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB41_1: // %partword.cmpxchg.loop
@@ -1952,7 +1952,7 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB41_1;
 ; SM70-NEXT:  $L__BB41_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
     ret i8 %new
@@ -1970,18 +1970,18 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB42_1: // %partword.cmpxchg.loop
@@ -1999,7 +1999,7 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB42_1;
 ; SM70-NEXT:  $L__BB42_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
     ret i8 %new
@@ -2017,18 +2017,18 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB43_1: // %partword.cmpxchg.loop
@@ -2046,7 +2046,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB43_1;
 ; SM70-NEXT:  $L__BB43_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
     ret i8 %new
@@ -2064,18 +2064,18 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB44_1: // %partword.cmpxchg.loop
@@ -2093,7 +2093,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB44_1;
 ; SM70-NEXT:  $L__BB44_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
     ret i8 %new
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
index f289c3cf3d509..e20f988577282 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
@@ -14,17 +14,17 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_generic_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
@@ -41,7 +41,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB0_1;
 ; SM90-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
     ret i8 %new
@@ -59,17 +59,17 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_global_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
@@ -86,7 +86,7 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB1_1;
 ; SM90-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
     ret i8 %new
@@ -104,17 +104,17 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.param.b8 %r9, [monotonic_monotonic_i8_shared_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
@@ -131,7 +131,7 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB2_1;
 ; SM90-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
     ret i8 %new
@@ -149,17 +149,17 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_generic_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
@@ -177,7 +177,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB3_1;
 ; SM90-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
     ret i8 %new
@@ -195,17 +195,17 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_global_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
@@ -223,7 +223,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    @%p2 bra $L__BB4_1;
 ; SM90-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
     ret i8 %new
@@ -241,17 +241,17 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.param.b8 %r9, [monotonic_acquire_i8_shared_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
@@ -269,7 +269,7 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    @%p2 bra $L__BB5_1;
 ; SM90-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
     ret i8 %new
@@ -287,18 +287,18 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
@@ -316,7 +316,7 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB6_1;
 ; SM90-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
     ret i8 %new
@@ -334,18 +334,18 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
@@ -363,7 +363,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    @%p2 bra $L__BB7_1;
 ; SM90-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
     ret i8 %new
@@ -381,18 +381,18 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [monotonic_seq_cst_i8_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
@@ -410,7 +410,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    @%p2 bra $L__BB8_1;
 ; SM90-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
     ret i8 %new
@@ -428,17 +428,17 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_generic_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
@@ -456,7 +456,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB9_1;
 ; SM90-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
     ret i8 %new
@@ -474,17 +474,17 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_global_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
@@ -502,7 +502,7 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    @%p2 bra $L__BB10_1;
 ; SM90-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
     ret i8 %new
@@ -520,17 +520,17 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.param.b8 %r9, [acquire_monotonic_i8_shared_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
@@ -548,7 +548,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    @%p2 bra $L__BB11_1;
 ; SM90-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
     ret i8 %new
@@ -566,17 +566,17 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_generic_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
@@ -594,7 +594,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB12_1;
 ; SM90-NEXT:  $L__BB12_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
     ret i8 %new
@@ -612,17 +612,17 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_global_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
@@ -640,7 +640,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB13_1;
 ; SM90-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
     ret i8 %new
@@ -658,17 +658,17 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.param.b8 %r9, [acquire_acquire_i8_shared_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
@@ -686,7 +686,7 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB14_1;
 ; SM90-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
     ret i8 %new
@@ -704,18 +704,18 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
@@ -733,7 +733,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB15_1;
 ; SM90-NEXT:  $L__BB15_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
     ret i8 %new
@@ -751,18 +751,18 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
@@ -780,7 +780,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB16_1;
 ; SM90-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
     ret i8 %new
@@ -798,18 +798,18 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [acquire_seq_cst_i8_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
@@ -827,7 +827,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB17_1;
 ; SM90-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
     ret i8 %new
@@ -845,18 +845,18 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0];
 ; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
@@ -873,7 +873,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB18_1;
 ; SM90-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
     ret i8 %new
@@ -891,18 +891,18 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_global_param_0];
 ; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
@@ -919,7 +919,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB19_1;
 ; SM90-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
     ret i8 %new
@@ -937,18 +937,18 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0];
 ; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [release_monotonic_i8_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_monotonic_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
@@ -965,7 +965,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    mov.b32 %r20, %r8;
 ; SM90-NEXT:    @%p2 bra $L__BB20_1;
 ; SM90-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
     ret i8 %new
@@ -983,18 +983,18 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_generic_param_0];
 ; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [release_acquire_i8_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_acquire_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
@@ -1012,7 +1012,7 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB21_1;
 ; SM90-NEXT:  $L__BB21_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
     ret i8 %new
@@ -1030,18 +1030,18 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_global_param_0];
 ; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [release_acquire_i8_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_acquire_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
@@ -1059,7 +1059,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB22_1;
 ; SM90-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
     ret i8 %new
@@ -1077,18 +1077,18 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [release_acquire_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_acquire_i8_shared_param_0];
 ; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [release_acquire_i8_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_acquire_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
@@ -1106,7 +1106,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB23_1;
 ; SM90-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
     ret i8 %new
@@ -1124,18 +1124,18 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
@@ -1153,7 +1153,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB24_1;
 ; SM90-NEXT:  $L__BB24_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
     ret i8 %new
@@ -1171,18 +1171,18 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
@@ -1200,7 +1200,7 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB25_1;
 ; SM90-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
     ret i8 %new
@@ -1218,18 +1218,18 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [release_seq_cst_i8_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
@@ -1247,7 +1247,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB26_1;
 ; SM90-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
     ret i8 %new
@@ -1265,18 +1265,18 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
 ; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
@@ -1294,7 +1294,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB27_1;
 ; SM90-NEXT:  $L__BB27_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
     ret i8 %new
@@ -1312,18 +1312,18 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0];
 ; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
@@ -1341,7 +1341,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    @%p2 bra $L__BB28_1;
 ; SM90-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
     ret i8 %new
@@ -1359,18 +1359,18 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
 ; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_monotonic_i8_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
@@ -1388,7 +1388,7 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    @%p2 bra $L__BB29_1;
 ; SM90-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
     ret i8 %new
@@ -1406,18 +1406,18 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0];
 ; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB30_1: // %partword.cmpxchg.loop
@@ -1435,7 +1435,7 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB30_1;
 ; SM90-NEXT:  $L__BB30_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
     ret i8 %new
@@ -1453,18 +1453,18 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0];
 ; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB31_1: // %partword.cmpxchg.loop
@@ -1482,7 +1482,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB31_1;
 ; SM90-NEXT:  $L__BB31_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
     ret i8 %new
@@ -1500,18 +1500,18 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0];
 ; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_acquire_i8_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB32_1: // %partword.cmpxchg.loop
@@ -1529,7 +1529,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB32_1;
 ; SM90-NEXT:  $L__BB32_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
     ret i8 %new
@@ -1547,18 +1547,18 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB33_1: // %partword.cmpxchg.loop
@@ -1576,7 +1576,7 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB33_1;
 ; SM90-NEXT:  $L__BB33_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
     ret i8 %new
@@ -1594,18 +1594,18 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB34_1: // %partword.cmpxchg.loop
@@ -1623,7 +1623,7 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB34_1;
 ; SM90-NEXT:  $L__BB34_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
     ret i8 %new
@@ -1641,18 +1641,18 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [acq_rel_seq_cst_i8_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB35_1: // %partword.cmpxchg.loop
@@ -1670,7 +1670,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB35_1;
 ; SM90-NEXT:  $L__BB35_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
     ret i8 %new
@@ -1688,18 +1688,18 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB36_1: // %partword.cmpxchg.loop
@@ -1717,7 +1717,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB36_1;
 ; SM90-NEXT:  $L__BB36_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
     ret i8 %new
@@ -1735,18 +1735,18 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB37_1: // %partword.cmpxchg.loop
@@ -1764,7 +1764,7 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    @%p2 bra $L__BB37_1;
 ; SM90-NEXT:  $L__BB37_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
     ret i8 %new
@@ -1782,18 +1782,18 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_monotonic_i8_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB38_1: // %partword.cmpxchg.loop
@@ -1811,7 +1811,7 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
 ; SM90-NEXT:    @%p2 bra $L__BB38_1;
 ; SM90-NEXT:  $L__BB38_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
     ret i8 %new
@@ -1829,18 +1829,18 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB39_1: // %partword.cmpxchg.loop
@@ -1858,7 +1858,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB39_1;
 ; SM90-NEXT:  $L__BB39_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
     ret i8 %new
@@ -1876,18 +1876,18 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB40_1: // %partword.cmpxchg.loop
@@ -1905,7 +1905,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB40_1;
 ; SM90-NEXT:  $L__BB40_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
     ret i8 %new
@@ -1923,18 +1923,18 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_acquire_i8_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB41_1: // %partword.cmpxchg.loop
@@ -1952,7 +1952,7 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB41_1;
 ; SM90-NEXT:  $L__BB41_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
     ret i8 %new
@@ -1970,18 +1970,18 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_generic_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB42_1: // %partword.cmpxchg.loop
@@ -1999,7 +1999,7 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB42_1;
 ; SM90-NEXT:  $L__BB42_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
     ret i8 %new
@@ -2017,18 +2017,18 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_global_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.global.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB43_1: // %partword.cmpxchg.loop
@@ -2046,7 +2046,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB43_1;
 ; SM90-NEXT:  $L__BB43_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
     ret i8 %new
@@ -2064,18 +2064,18 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
 ; SM90-NEXT:    ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
 ; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.b8 %r9, [seq_cst_seq_cst_i8_shared_param_1];
 ; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM90-NEXT:    and.b32 %r10, %r9, 3;
-; SM90-NEXT:    shl.b32 %r1, %r10, 3;
-; SM90-NEXT:    mov.b32 %r11, 255;
-; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM90-NEXT:    not.b32 %r2, %r12;
-; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM90-NEXT:    and.b32 %r14, %r13, 255;
-; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM90-NEXT:    ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
-; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 255;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    and.b32 %r15, %r14, 255;
+; SM90-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM90-NEXT:    ld.shared.b32 %r16, [%rd1];
 ; SM90-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM90-NEXT:  $L__BB44_1: // %partword.cmpxchg.loop
@@ -2093,7 +2093,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
 ; SM90-NEXT:    @%p2 bra $L__BB44_1;
 ; SM90-NEXT:  $L__BB44_3: // %partword.cmpxchg.end
 ; SM90-NEXT:    fence.acquire.sys;
-; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM90-NEXT:    ret;
     %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
     ret i8 %new
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
index 9eeff9d7c2b75..85414a2ab04e8 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
@@ -21,17 +21,17 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    ld.param.b8 %rs1, [relaxed_sys_i8_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i8_param_0];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM30-NEXT:    and.b32 %r10, %r9, 3;
-; SM30-NEXT:    shl.b32 %r1, %r10, 3;
-; SM30-NEXT:    mov.b32 %r11, 255;
-; SM30-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM30-NEXT:    not.b32 %r2, %r12;
-; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM30-NEXT:    and.b32 %r14, %r13, 255;
-; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    ld.param.b8 %r15, [relaxed_sys_i8_param_1];
-; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM30-NEXT:    ld.param.b8 %r9, [relaxed_sys_i8_param_1];
+; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM30-NEXT:    and.b32 %r11, %r10, 3;
+; SM30-NEXT:    shl.b32 %r1, %r11, 3;
+; SM30-NEXT:    mov.b32 %r12, 255;
+; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM30-NEXT:    not.b32 %r2, %r13;
+; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM30-NEXT:    and.b32 %r15, %r14, 255;
+; SM30-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM30-NEXT:    ld.b32 %r16, [%rd1];
 ; SM30-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM30-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
@@ -48,7 +48,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    mov.b32 %r20, %r8;
 ; SM30-NEXT:    @%p2 bra $L__BB0_1;
 ; SM30-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
-; SM30-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: relaxed_sys_i8(
@@ -62,17 +62,17 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [relaxed_sys_i8_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [relaxed_sys_i8_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [relaxed_sys_i8_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.param.b8 %r9, [relaxed_sys_i8_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
@@ -89,7 +89,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB0_1;
 ; SM70-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: relaxed_sys_i8(
 ; SM90:       {
@@ -147,17 +147,17 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    ld.param.b8 %rs1, [acquire_sys_i8_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [acquire_sys_i8_param_0];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM30-NEXT:    and.b32 %r10, %r9, 3;
-; SM30-NEXT:    shl.b32 %r1, %r10, 3;
-; SM30-NEXT:    mov.b32 %r11, 255;
-; SM30-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM30-NEXT:    not.b32 %r2, %r12;
-; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM30-NEXT:    and.b32 %r14, %r13, 255;
-; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    ld.param.b8 %r15, [acquire_sys_i8_param_1];
-; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM30-NEXT:    ld.param.b8 %r9, [acquire_sys_i8_param_1];
+; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM30-NEXT:    and.b32 %r11, %r10, 3;
+; SM30-NEXT:    shl.b32 %r1, %r11, 3;
+; SM30-NEXT:    mov.b32 %r12, 255;
+; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM30-NEXT:    not.b32 %r2, %r13;
+; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM30-NEXT:    and.b32 %r15, %r14, 255;
+; SM30-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM30-NEXT:    ld.b32 %r16, [%rd1];
 ; SM30-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM30-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
@@ -175,7 +175,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    @%p2 bra $L__BB1_1;
 ; SM30-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: acquire_sys_i8(
@@ -189,17 +189,17 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acquire_sys_i8_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acquire_sys_i8_param_0];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acquire_sys_i8_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.param.b8 %r9, [acquire_sys_i8_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
@@ -217,7 +217,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB1_1;
 ; SM70-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: acquire_sys_i8(
 ; SM90:       {
@@ -276,18 +276,18 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    ld.param.b8 %rs1, [release_sys_i8_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [release_sys_i8_param_0];
 ; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    ld.param.b8 %r9, [release_sys_i8_param_1];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM30-NEXT:    and.b32 %r10, %r9, 3;
-; SM30-NEXT:    shl.b32 %r1, %r10, 3;
-; SM30-NEXT:    mov.b32 %r11, 255;
-; SM30-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM30-NEXT:    not.b32 %r2, %r12;
-; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM30-NEXT:    and.b32 %r14, %r13, 255;
-; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    ld.param.b8 %r15, [release_sys_i8_param_1];
-; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM30-NEXT:    and.b32 %r11, %r10, 3;
+; SM30-NEXT:    shl.b32 %r1, %r11, 3;
+; SM30-NEXT:    mov.b32 %r12, 255;
+; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM30-NEXT:    not.b32 %r2, %r13;
+; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM30-NEXT:    and.b32 %r15, %r14, 255;
+; SM30-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM30-NEXT:    ld.b32 %r16, [%rd1];
 ; SM30-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM30-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
@@ -304,7 +304,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    mov.b32 %r20, %r8;
 ; SM30-NEXT:    @%p2 bra $L__BB2_1;
 ; SM30-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
-; SM30-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: release_sys_i8(
@@ -318,18 +318,18 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [release_sys_i8_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [release_sys_i8_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [release_sys_i8_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [release_sys_i8_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
@@ -346,7 +346,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    mov.b32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB2_1;
 ; SM70-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: release_sys_i8(
 ; SM90:       {
@@ -405,18 +405,18 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    ld.param.b8 %rs1, [acq_rel_sys_i8_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i8_param_0];
 ; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    ld.param.b8 %r9, [acq_rel_sys_i8_param_1];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM30-NEXT:    and.b32 %r10, %r9, 3;
-; SM30-NEXT:    shl.b32 %r1, %r10, 3;
-; SM30-NEXT:    mov.b32 %r11, 255;
-; SM30-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM30-NEXT:    not.b32 %r2, %r12;
-; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM30-NEXT:    and.b32 %r14, %r13, 255;
-; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    ld.param.b8 %r15, [acq_rel_sys_i8_param_1];
-; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM30-NEXT:    and.b32 %r11, %r10, 3;
+; SM30-NEXT:    shl.b32 %r1, %r11, 3;
+; SM30-NEXT:    mov.b32 %r12, 255;
+; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM30-NEXT:    not.b32 %r2, %r13;
+; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM30-NEXT:    and.b32 %r15, %r14, 255;
+; SM30-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM30-NEXT:    ld.b32 %r16, [%rd1];
 ; SM30-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM30-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
@@ -434,7 +434,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    @%p2 bra $L__BB3_1;
 ; SM30-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: acq_rel_sys_i8(
@@ -448,18 +448,18 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [acq_rel_sys_i8_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [acq_rel_sys_i8_param_0];
 ; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [acq_rel_sys_i8_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [acq_rel_sys_i8_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
@@ -477,7 +477,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB3_1;
 ; SM70-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: acq_rel_sys_i8(
 ; SM90:       {
@@ -537,18 +537,18 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    ld.param.b8 %rs1, [seq_cst_sys_i8_param_2];
 ; SM30-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i8_param_0];
 ; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    ld.param.b8 %r9, [seq_cst_sys_i8_param_1];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM30-NEXT:    and.b32 %r10, %r9, 3;
-; SM30-NEXT:    shl.b32 %r1, %r10, 3;
-; SM30-NEXT:    mov.b32 %r11, 255;
-; SM30-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM30-NEXT:    not.b32 %r2, %r12;
-; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM30-NEXT:    and.b32 %r14, %r13, 255;
-; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    ld.param.b8 %r15, [seq_cst_sys_i8_param_1];
-; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM30-NEXT:    and.b32 %r11, %r10, 3;
+; SM30-NEXT:    shl.b32 %r1, %r11, 3;
+; SM30-NEXT:    mov.b32 %r12, 255;
+; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM30-NEXT:    not.b32 %r2, %r13;
+; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM30-NEXT:    and.b32 %r15, %r14, 255;
+; SM30-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM30-NEXT:    ld.b32 %r16, [%rd1];
 ; SM30-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM30-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
@@ -566,7 +566,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    @%p2 bra $L__BB4_1;
 ; SM30-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
 ; SM30-NEXT:    membar.sys;
-; SM30-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM30-NEXT:    ret;
 ;
 ; SM70-LABEL: seq_cst_sys_i8(
@@ -580,18 +580,18 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    ld.param.b8 %rs1, [seq_cst_sys_i8_param_2];
 ; SM70-NEXT:    ld.param.b64 %rd2, [seq_cst_sys_i8_param_0];
 ; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.b8 %r9, [seq_cst_sys_i8_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
-; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.b8 %r15, [seq_cst_sys_i8_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 255;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    and.b32 %r15, %r14, 255;
+; SM70-NEXT:    shl.b32 %r3, %r15, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.b32 %r16, [%rd1];
 ; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
@@ -609,7 +609,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-NEXT:    @%p2 bra $L__BB4_1;
 ; SM70-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
 ; SM70-NEXT:    fence.acq_rel.sys;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
 ; SM90-LABEL: seq_cst_sys_i8(
 ; SM90:       {
diff --git a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll
index 8a0c0f8c3b452..a2fc8da3f1e61 100644
--- a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll
+++ b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll
@@ -24,11 +24,11 @@ define i16 @cvt_i16_i32(i32 %x) {
 define i16 @cvt_i16_i64(i64 %x) {
 ; CHECK-LABEL: cvt_i16_i64(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %rd1, [cvt_i16_i64_param_0];
-; CHECK-NEXT:    st.param.b32 [func_retval0], %rd1;
+; CHECK-NEXT:    ld.param.b16 %r1, [cvt_i16_i64_param_0];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %a = trunc i64 %x to i16
   ret i16 %a
diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll
index b1eadf381d3b4..f37777ab954e2 100644
--- a/llvm/test/CodeGen/NVPTX/extractelement.ll
+++ b/llvm/test/CodeGen/NVPTX/extractelement.ll
@@ -158,27 +158,24 @@ define i16  @test_v8i8(i64 %a) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<16>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b64 %rd1, [test_v8i8_param_0];
-; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; }
-; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
-; CHECK-NEXT:    bfe.s32 %r3, %r2, 0, 8;
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_v8i8_param_0];
+; CHECK-NEXT:    bfe.s32 %r3, %r1, 0, 8;
 ; CHECK-NEXT:    cvt.s8.s32 %rs1, %r3;
-; CHECK-NEXT:    bfe.s32 %r4, %r2, 8, 8;
+; CHECK-NEXT:    bfe.s32 %r4, %r1, 8, 8;
 ; CHECK-NEXT:    cvt.s8.s32 %rs2, %r4;
-; CHECK-NEXT:    bfe.s32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    bfe.s32 %r5, %r1, 16, 8;
 ; CHECK-NEXT:    cvt.s8.s32 %rs3, %r5;
-; CHECK-NEXT:    bfe.s32 %r6, %r2, 24, 8;
+; CHECK-NEXT:    bfe.s32 %r6, %r1, 24, 8;
 ; CHECK-NEXT:    cvt.s8.s32 %rs4, %r6;
-; CHECK-NEXT:    bfe.s32 %r7, %r1, 0, 8;
+; CHECK-NEXT:    bfe.s32 %r7, %r2, 0, 8;
 ; CHECK-NEXT:    cvt.s8.s32 %rs5, %r7;
-; CHECK-NEXT:    bfe.s32 %r8, %r1, 8, 8;
+; CHECK-NEXT:    bfe.s32 %r8, %r2, 8, 8;
 ; CHECK-NEXT:    cvt.s8.s32 %rs6, %r8;
-; CHECK-NEXT:    bfe.s32 %r9, %r1, 16, 8;
+; CHECK-NEXT:    bfe.s32 %r9, %r2, 16, 8;
 ; CHECK-NEXT:    cvt.s8.s32 %rs7, %r9;
-; CHECK-NEXT:    bfe.s32 %r10, %r1, 24, 8;
+; CHECK-NEXT:    bfe.s32 %r10, %r2, 24, 8;
 ; CHECK-NEXT:    cvt.s8.s32 %rs8, %r10;
 ; CHECK-NEXT:    add.s16 %rs9, %rs1, %rs2;
 ; CHECK-NEXT:    add.s16 %rs10, %rs3, %rs4;
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 0a2cd81ac904c..321a6240df098 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -121,7 +121,7 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p
 define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
 ; PTX-LABEL: grid_const_escape(
 ; PTX:       {
-; PTX-NEXT:    .reg .b32 %r<3>;
+; PTX-NEXT:    .reg .b32 %r<2>;
 ; PTX-NEXT:    .reg .b64 %rd<4>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
@@ -153,7 +153,7 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
 ; PTX-NEXT:    .local .align 4 .b8 __local_depot4[4];
 ; PTX-NEXT:    .reg .b64 %SP;
 ; PTX-NEXT:    .reg .b64 %SPL;
-; PTX-NEXT:    .reg .b32 %r<4>;
+; PTX-NEXT:    .reg .b32 %r<3>;
 ; PTX-NEXT:    .reg .b64 %rd<8>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
@@ -255,7 +255,7 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4
 define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 ; PTX-LABEL: grid_const_partial_escape(
 ; PTX:       {
-; PTX-NEXT:    .reg .b32 %r<5>;
+; PTX-NEXT:    .reg .b32 %r<4>;
 ; PTX-NEXT:    .reg .b64 %rd<6>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
@@ -295,7 +295,7 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou
 define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
 ; PTX-LABEL: grid_const_partial_escapemem(
 ; PTX:       {
-; PTX-NEXT:    .reg .b32 %r<6>;
+; PTX-NEXT:    .reg .b32 %r<5>;
 ; PTX-NEXT:    .reg .b64 %rd<6>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index 6f334b075241b..c165de7ffff03 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -31,7 +31,7 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %
 ; PTX-LABEL: load_alignment(
 ; PTX:       {
 ; PTX-NEXT:    .reg .b32 %r<4>;
-; PTX-NEXT:    .reg .b64 %rd<8>;
+; PTX-NEXT:    .reg .b64 %rd<7>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0: // %entry
 ; PTX-NEXT:    mov.b64 %rd1, load_alignment_param_0;
@@ -76,7 +76,7 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
 ;
 ; PTX-LABEL: load_padding(
 ; PTX:       {
-; PTX-NEXT:    .reg .b64 %rd<5>;
+; PTX-NEXT:    .reg .b64 %rd<4>;
 ; PTX-EMPTY:
 ; PTX-NEXT:  // %bb.0:
 ; PTX-NEXT:    mov.b64 %rd1, load_padding_param_0;
diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
index 2e9eb6913ac0e..8401f457418d1 100644
--- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll
+++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
@@ -8,7 +8,7 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-LABEL: wombat(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<11>;
-; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %bb
 ; CHECK-NEXT:    ld.param.b32 %r4, [wombat_param_2];
@@ -27,11 +27,11 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
 ; CHECK-NEXT:    mul.lo.s32 %r7, %r10, %r3;
 ; CHECK-NEXT:    or.b32 %r8, %r4, %r7;
 ; CHECK-NEXT:    mul.lo.s32 %r9, %r2, %r8;
-; CHECK-NEXT:    cvt.rn.f64.s32 %rd3, %r9;
-; CHECK-NEXT:    cvt.rn.f64.u32 %rd4, %r10;
-; CHECK-NEXT:    add.rn.f64 %rd5, %rd4, %rd3;
-; CHECK-NEXT:    mov.b64 %rd6, 0;
-; CHECK-NEXT:    st.global.b64 [%rd6], %rd5;
+; CHECK-NEXT:    cvt.rn.f64.s32 %rd2, %r9;
+; CHECK-NEXT:    cvt.rn.f64.u32 %rd3, %r10;
+; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, %rd2;
+; CHECK-NEXT:    mov.b64 %rd5, 0;
+; CHECK-NEXT:    st.global.b64 [%rd5], %rd4;
 ; CHECK-NEXT:    mov.b32 %r10, 1;
 ; CHECK-NEXT:    bra.uni $L__BB0_1;
 bb:
diff --git a/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll b/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll
index 3096b953e8d3a..160511387652c 100644
--- a/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll
+++ b/llvm/test/CodeGen/NVPTX/pow2_mask_cmp.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=nvptx64 -verify-machineinstrs < %s | FileCheck %s
 ; RUN: %if ptxas %{ llc -mtriple=nvptx64 -verify-machineinstrs < %s | %ptxas-verify %}
 
@@ -10,9 +11,19 @@
 ; value will be identical regardless of the boolean representation.
 ; Check that the optimization triggers in this case.
 
-; CHECK-LABEL: @pow2_mask_cmp
-; CHECK: bfe.u32 {{%r[0-9]+}}, {{%r[0-9]+}}, 3, 1
 define i32 @pow2_mask_cmp(i32 %x) {
+; CHECK-LABEL: pow2_mask_cmp(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [pow2_mask_cmp_param_0];
+; CHECK-NEXT:    shr.u16 %rs2, %rs1, 3;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs2;
+; CHECK-NEXT:    and.b32 %r2, %r1, 1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %a = and i32 %x, 8
   %cmp = icmp ne i32 %a, 0
   %r = zext i1 %cmp to i32
diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
index 50d3e8049a947..6aa111932a4a5 100644
--- a/llvm/test/CodeGen/NVPTX/st-param-imm.ll
+++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll
@@ -445,12 +445,12 @@ define void @st_param_v4_i8_irrr(i8 %b, i8 %c, i8 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_irrr_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_irrr_param_2];
 ; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_irrr_param_1];
-; CHECK-NEXT:    ld.param.b8 %rs3, [st_param_v4_i8_irrr_param_2];
+; CHECK-NEXT:    ld.param.b8 %rs3, [st_param_v4_i8_irrr_param_0];
 ; CHECK-NEXT:    { // callseq 24, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0], {1, %rs1, %rs2, %rs3};
+; CHECK-NEXT:    st.param.v4.b8 [param0], {1, %rs3, %rs2, %rs1};
 ; CHECK-NEXT:    call.uni call_v4_i8, (param0);
 ; CHECK-NEXT:    } // callseq 24
 ; CHECK-NEXT:    ret;
@@ -467,12 +467,12 @@ define void @st_param_v4_i8_rirr(i8 %a, i8 %c, i8 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_rirr_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_rirr_param_2];
 ; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_rirr_param_1];
-; CHECK-NEXT:    ld.param.b8 %rs3, [st_param_v4_i8_rirr_param_2];
+; CHECK-NEXT:    ld.param.b8 %rs3, [st_param_v4_i8_rirr_param_0];
 ; CHECK-NEXT:    { // callseq 25, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs1, 2, %rs2, %rs3};
+; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs3, 2, %rs2, %rs1};
 ; CHECK-NEXT:    call.uni call_v4_i8, (param0);
 ; CHECK-NEXT:    } // callseq 25
 ; CHECK-NEXT:    ret;
@@ -489,12 +489,12 @@ define void @st_param_v4_i8_rrir(i8 %a, i8 %b, i8 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_rrir_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_rrir_param_2];
 ; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_rrir_param_1];
-; CHECK-NEXT:    ld.param.b8 %rs3, [st_param_v4_i8_rrir_param_2];
+; CHECK-NEXT:    ld.param.b8 %rs3, [st_param_v4_i8_rrir_param_0];
 ; CHECK-NEXT:    { // callseq 26, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs1, %rs2, 3, %rs3};
+; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs3, %rs2, 3, %rs1};
 ; CHECK-NEXT:    call.uni call_v4_i8, (param0);
 ; CHECK-NEXT:    } // callseq 26
 ; CHECK-NEXT:    ret;
@@ -511,12 +511,12 @@ define void @st_param_v4_i8_rrri(i8 %a, i8 %b, i8 %c) {
 ; CHECK-NEXT:    .reg .b16 %rs<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_rrri_param_0];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_rrri_param_2];
 ; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_rrri_param_1];
-; CHECK-NEXT:    ld.param.b8 %rs3, [st_param_v4_i8_rrri_param_2];
+; CHECK-NEXT:    ld.param.b8 %rs3, [st_param_v4_i8_rrri_param_0];
 ; CHECK-NEXT:    { // callseq 27, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs1, %rs2, %rs3, 4};
+; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs3, %rs2, %rs1, 4};
 ; CHECK-NEXT:    call.uni call_v4_i8, (param0);
 ; CHECK-NEXT:    } // callseq 27
 ; CHECK-NEXT:    ret;
@@ -533,11 +533,11 @@ define void @st_param_v4_i8_iirr(i8 %c, i8 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_iirr_param_0];
-; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_iirr_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_iirr_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_iirr_param_0];
 ; CHECK-NEXT:    { // callseq 28, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0], {1, 2, %rs1, %rs2};
+; CHECK-NEXT:    st.param.v4.b8 [param0], {1, 2, %rs2, %rs1};
 ; CHECK-NEXT:    call.uni call_v4_i8, (param0);
 ; CHECK-NEXT:    } // callseq 28
 ; CHECK-NEXT:    ret;
@@ -554,11 +554,11 @@ define void @st_param_v4_i8_irir(i8 %b, i8 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_irir_param_0];
-; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_irir_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_irir_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_irir_param_0];
 ; CHECK-NEXT:    { // callseq 29, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0], {1, %rs1, 3, %rs2};
+; CHECK-NEXT:    st.param.v4.b8 [param0], {1, %rs2, 3, %rs1};
 ; CHECK-NEXT:    call.uni call_v4_i8, (param0);
 ; CHECK-NEXT:    } // callseq 29
 ; CHECK-NEXT:    ret;
@@ -575,11 +575,11 @@ define void @st_param_v4_i8_irri(i8 %b, i8 %c) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_irri_param_0];
-; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_irri_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_irri_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_irri_param_0];
 ; CHECK-NEXT:    { // callseq 30, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0], {1, %rs1, %rs2, 4};
+; CHECK-NEXT:    st.param.v4.b8 [param0], {1, %rs2, %rs1, 4};
 ; CHECK-NEXT:    call.uni call_v4_i8, (param0);
 ; CHECK-NEXT:    } // callseq 30
 ; CHECK-NEXT:    ret;
@@ -596,11 +596,11 @@ define void @st_param_v4_i8_riir(i8 %a, i8 %d) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_riir_param_0];
-; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_riir_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_riir_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_riir_param_0];
 ; CHECK-NEXT:    { // callseq 31, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs1, 2, 3, %rs2};
+; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs2, 2, 3, %rs1};
 ; CHECK-NEXT:    call.uni call_v4_i8, (param0);
 ; CHECK-NEXT:    } // callseq 31
 ; CHECK-NEXT:    ret;
@@ -617,11 +617,11 @@ define void @st_param_v4_i8_riri(i8 %a, i8 %c) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_riri_param_0];
-; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_riri_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_riri_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_riri_param_0];
 ; CHECK-NEXT:    { // callseq 32, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs1, 2, %rs2, 4};
+; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs2, 2, %rs1, 4};
 ; CHECK-NEXT:    call.uni call_v4_i8, (param0);
 ; CHECK-NEXT:    } // callseq 32
 ; CHECK-NEXT:    ret;
@@ -638,11 +638,11 @@ define void @st_param_v4_i8_rrii(i8 %a, i8 %b) {
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_rrii_param_0];
-; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_rrii_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs1, [st_param_v4_i8_rrii_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs2, [st_param_v4_i8_rrii_param_0];
 ; CHECK-NEXT:    { // callseq 33, 0
 ; CHECK-NEXT:    .param .align 4 .b8 param0[4];
-; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs1, %rs2, 3, 4};
+; CHECK-NEXT:    st.param.v4.b8 [param0], {%rs2, %rs1, 3, 4};
 ; CHECK-NEXT:    call.uni call_v4_i8, (param0);
 ; CHECK-NEXT:    } // callseq 33
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
index e4e668018d872..87e46b1505e31 100644
--- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
@@ -103,16 +103,16 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
 ; CHECK-NEXT:    ld.param.b8 %rs6, [retval0+8];
 ; CHECK-NEXT:    ld.param.b64 %rd2, [retval0+16];
 ; CHECK-NEXT:    } // callseq 1
-; CHECK-NEXT:    cvt.u32.u16 %r16, %rs3;
-; CHECK-NEXT:    cvt.u32.u16 %r17, %rs4;
-; CHECK-NEXT:    cvt.u32.u16 %r18, %rs5;
-; CHECK-NEXT:    cvt.u32.u16 %r19, %rs6;
+; CHECK-NEXT:    cvt.u32.u16 %r15, %rs3;
+; CHECK-NEXT:    cvt.u32.u16 %r16, %rs4;
+; CHECK-NEXT:    cvt.u32.u16 %r17, %rs5;
+; CHECK-NEXT:    cvt.u32.u16 %r18, %rs6;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r14;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs2;
-; CHECK-NEXT:    st.param.b8 [func_retval0+8], %r19;
-; CHECK-NEXT:    st.param.b8 [func_retval0+7], %r18;
-; CHECK-NEXT:    st.param.b8 [func_retval0+6], %r17;
-; CHECK-NEXT:    st.param.b8 [func_retval0+5], %r16;
+; CHECK-NEXT:    st.param.b8 [func_retval0+8], %r18;
+; CHECK-NEXT:    st.param.b8 [func_retval0+7], %r17;
+; CHECK-NEXT:    st.param.b8 [func_retval0+6], %r16;
+; CHECK-NEXT:    st.param.b8 [func_retval0+5], %r15;
 ; CHECK-NEXT:    st.param.b64 [func_retval0+16], %rd2;
 ; CHECK-NEXT:    ret;
   %r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a)
@@ -185,48 +185,48 @@ define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
 ; CHECK-NEXT:    ld.param.b8 %rs10, [retval0+16];
 ; CHECK-NEXT:    ld.param.b64 %rd32, [retval0+24];
 ; CHECK-NEXT:    } // callseq 2
-; CHECK-NEXT:    cvt.u64.u16 %rd35, %rs3;
+; CHECK-NEXT:    cvt.u64.u16 %rd33, %rs3;
+; CHECK-NEXT:    and.b64 %rd34, %rd33, 255;
+; CHECK-NEXT:    cvt.u64.u16 %rd35, %rs4;
 ; CHECK-NEXT:    and.b64 %rd36, %rd35, 255;
-; CHECK-NEXT:    cvt.u64.u16 %rd37, %rs4;
-; CHECK-NEXT:    and.b64 %rd38, %rd37, 255;
-; CHECK-NEXT:    shl.b64 %rd39, %rd38, 8;
-; CHECK-NEXT:    or.b64 %rd40, %rd36, %rd39;
-; CHECK-NEXT:    cvt.u64.u16 %rd41, %rs5;
-; CHECK-NEXT:    and.b64 %rd42, %rd41, 255;
-; CHECK-NEXT:    shl.b64 %rd43, %rd42, 16;
-; CHECK-NEXT:    or.b64 %rd44, %rd40, %rd43;
-; CHECK-NEXT:    cvt.u64.u16 %rd45, %rs6;
-; CHECK-NEXT:    and.b64 %rd46, %rd45, 255;
-; CHECK-NEXT:    shl.b64 %rd47, %rd46, 24;
-; CHECK-NEXT:    or.b64 %rd48, %rd44, %rd47;
-; CHECK-NEXT:    cvt.u64.u16 %rd49, %rs7;
-; CHECK-NEXT:    and.b64 %rd50, %rd49, 255;
-; CHECK-NEXT:    shl.b64 %rd51, %rd50, 32;
-; CHECK-NEXT:    or.b64 %rd52, %rd48, %rd51;
-; CHECK-NEXT:    cvt.u64.u16 %rd53, %rs8;
-; CHECK-NEXT:    and.b64 %rd54, %rd53, 255;
-; CHECK-NEXT:    shl.b64 %rd55, %rd54, 40;
-; CHECK-NEXT:    or.b64 %rd56, %rd52, %rd55;
-; CHECK-NEXT:    cvt.u64.u16 %rd57, %rs9;
-; CHECK-NEXT:    and.b64 %rd58, %rd57, 255;
-; CHECK-NEXT:    shl.b64 %rd59, %rd58, 48;
-; CHECK-NEXT:    or.b64 %rd60, %rd56, %rd59;
-; CHECK-NEXT:    cvt.u64.u16 %rd61, %rs10;
-; CHECK-NEXT:    shl.b64 %rd62, %rd61, 56;
-; CHECK-NEXT:    or.b64 %rd63, %rd60, %rd62;
+; CHECK-NEXT:    shl.b64 %rd37, %rd36, 8;
+; CHECK-NEXT:    or.b64 %rd38, %rd34, %rd37;
+; CHECK-NEXT:    cvt.u64.u16 %rd39, %rs5;
+; CHECK-NEXT:    and.b64 %rd40, %rd39, 255;
+; CHECK-NEXT:    shl.b64 %rd41, %rd40, 16;
+; CHECK-NEXT:    or.b64 %rd42, %rd38, %rd41;
+; CHECK-NEXT:    cvt.u64.u16 %rd43, %rs6;
+; CHECK-NEXT:    and.b64 %rd44, %rd43, 255;
+; CHECK-NEXT:    shl.b64 %rd45, %rd44, 24;
+; CHECK-NEXT:    or.b64 %rd46, %rd42, %rd45;
+; CHECK-NEXT:    cvt.u64.u16 %rd47, %rs7;
+; CHECK-NEXT:    and.b64 %rd48, %rd47, 255;
+; CHECK-NEXT:    shl.b64 %rd49, %rd48, 32;
+; CHECK-NEXT:    or.b64 %rd50, %rd46, %rd49;
+; CHECK-NEXT:    cvt.u64.u16 %rd51, %rs8;
+; CHECK-NEXT:    and.b64 %rd52, %rd51, 255;
+; CHECK-NEXT:    shl.b64 %rd53, %rd52, 40;
+; CHECK-NEXT:    or.b64 %rd54, %rd50, %rd53;
+; CHECK-NEXT:    cvt.u64.u16 %rd55, %rs9;
+; CHECK-NEXT:    and.b64 %rd56, %rd55, 255;
+; CHECK-NEXT:    shl.b64 %rd57, %rd56, 48;
+; CHECK-NEXT:    or.b64 %rd58, %rd54, %rd57;
+; CHECK-NEXT:    cvt.u64.u16 %rd59, %rs10;
+; CHECK-NEXT:    shl.b64 %rd60, %rd59, 56;
+; CHECK-NEXT:    or.b64 %rd61, %rd58, %rd60;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd31;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+8], %rs2;
-; CHECK-NEXT:    st.param.b8 [func_retval0+12], %rd45;
-; CHECK-NEXT:    st.param.b8 [func_retval0+11], %rd41;
-; CHECK-NEXT:    st.param.b8 [func_retval0+10], %rd37;
-; CHECK-NEXT:    st.param.b8 [func_retval0+9], %rd35;
-; CHECK-NEXT:    shr.u64 %rd64, %rd52, 32;
+; CHECK-NEXT:    st.param.b8 [func_retval0+12], %rd43;
+; CHECK-NEXT:    st.param.b8 [func_retval0+11], %rd39;
+; CHECK-NEXT:    st.param.b8 [func_retval0+10], %rd35;
+; CHECK-NEXT:    st.param.b8 [func_retval0+9], %rd33;
+; CHECK-NEXT:    shr.u64 %rd64, %rd50, 32;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+13], %rd64;
-; CHECK-NEXT:    shr.u64 %rd65, %rd56, 40;
+; CHECK-NEXT:    shr.u64 %rd65, %rd54, 40;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+14], %rd65;
-; CHECK-NEXT:    shr.u64 %rd66, %rd60, 48;
+; CHECK-NEXT:    shr.u64 %rd66, %rd58, 48;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+15], %rd66;
-; CHECK-NEXT:    shr.u64 %rd67, %rd63, 56;
+; CHECK-NEXT:    shr.u64 %rd67, %rd61, 56;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+16], %rd67;
 ; CHECK-NEXT:    st.param.b64 [func_retval0+24], %rd32;
 ; CHECK-NEXT:    ret;
@@ -317,16 +317,16 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
 ; CHECK-NEXT:    ld.param.b8 %rs6, [retval0+8];
 ; CHECK-NEXT:    ld.param.b64 %rd2, [retval0+16];
 ; CHECK-NEXT:    } // callseq 4
-; CHECK-NEXT:    cvt.u32.u16 %r16, %rs3;
-; CHECK-NEXT:    cvt.u32.u16 %r17, %rs4;
-; CHECK-NEXT:    cvt.u32.u16 %r18, %rs5;
-; CHECK-NEXT:    cvt.u32.u16 %r19, %rs6;
+; CHECK-NEXT:    cvt.u32.u16 %r15, %rs3;
+; CHECK-NEXT:    cvt.u32.u16 %r16, %rs4;
+; CHECK-NEXT:    cvt.u32.u16 %r17, %rs5;
+; CHECK-NEXT:    cvt.u32.u16 %r18, %rs6;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r14;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs2;
-; CHECK-NEXT:    st.param.b8 [func_retval0+8], %r19;
-; CHECK-NEXT:    st.param.b8 [func_retval0+7], %r18;
-; CHECK-NEXT:    st.param.b8 [func_retval0+6], %r17;
-; CHECK-NEXT:    st.param.b8 [func_retval0+5], %r16;
+; CHECK-NEXT:    st.param.b8 [func_retval0+8], %r18;
+; CHECK-NEXT:    st.param.b8 [func_retval0+7], %r17;
+; CHECK-NEXT:    st.param.b8 [func_retval0+6], %r16;
+; CHECK-NEXT:    st.param.b8 [func_retval0+5], %r15;
 ; CHECK-NEXT:    st.param.b64 [func_retval0+16], %rd2;
 ; CHECK-NEXT:    ret;
   %r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a)
@@ -376,16 +376,16 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 ; CHECK-NEXT:    ld.param.b8 %rs6, [retval0+8];
 ; CHECK-NEXT:    ld.param.b64 %rd2, [retval0+16];
 ; CHECK-NEXT:    } // callseq 5
-; CHECK-NEXT:    cvt.u32.u16 %r16, %rs3;
-; CHECK-NEXT:    cvt.u32.u16 %r17, %rs4;
-; CHECK-NEXT:    cvt.u32.u16 %r18, %rs5;
-; CHECK-NEXT:    cvt.u32.u16 %r19, %rs6;
+; CHECK-NEXT:    cvt.u32.u16 %r15, %rs3;
+; CHECK-NEXT:    cvt.u32.u16 %r16, %rs4;
+; CHECK-NEXT:    cvt.u32.u16 %r17, %rs5;
+; CHECK-NEXT:    cvt.u32.u16 %r18, %rs6;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r14;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+4], %rs2;
-; CHECK-NEXT:    st.param.b8 [func_retval0+8], %r19;
-; CHECK-NEXT:    st.param.b8 [func_retval0+7], %r18;
-; CHECK-NEXT:    st.param.b8 [func_retval0+6], %r17;
-; CHECK-NEXT:    st.param.b8 [func_retval0+5], %r16;
+; CHECK-NEXT:    st.param.b8 [func_retval0+8], %r18;
+; CHECK-NEXT:    st.param.b8 [func_retval0+7], %r17;
+; CHECK-NEXT:    st.param.b8 [func_retval0+6], %r16;
+; CHECK-NEXT:    st.param.b8 [func_retval0+5], %r15;
 ; CHECK-NEXT:    st.param.b64 [func_retval0+16], %rd2;
 ; CHECK-NEXT:    ret;
   %r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a)
@@ -458,48 +458,48 @@ define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
 ; CHECK-NEXT:    ld.param.b8 %rs10, [retval0+16];
 ; CHECK-NEXT:    ld.param.b64 %rd32, [retval0+24];
 ; CHECK-NEXT:    } // callseq 6
-; CHECK-NEXT:    cvt.u64.u16 %rd35, %rs3;
+; CHECK-NEXT:    cvt.u64.u16 %rd33, %rs3;
+; CHECK-NEXT:    and.b64 %rd34, %rd33, 255;
+; CHECK-NEXT:    cvt.u64.u16 %rd35, %rs4;
 ; CHECK-NEXT:    and.b64 %rd36, %rd35, 255;
-; CHECK-NEXT:    cvt.u64.u16 %rd37, %rs4;
-; CHECK-NEXT:    and.b64 %rd38, %rd37, 255;
-; CHECK-NEXT:    shl.b64 %rd39, %rd38, 8;
-; CHECK-NEXT:    or.b64 %rd40, %rd36, %rd39;
-; CHECK-NEXT:    cvt.u64.u16 %rd41, %rs5;
-; CHECK-NEXT:    and.b64 %rd42, %rd41, 255;
-; CHECK-NEXT:    shl.b64 %rd43, %rd42, 16;
-; CHECK-NEXT:    or.b64 %rd44, %rd40, %rd43;
-; CHECK-NEXT:    cvt.u64.u16 %rd45, %rs6;
-; CHECK-NEXT:    and.b64 %rd46, %rd45, 255;
-; CHECK-NEXT:    shl.b64 %rd47, %rd46, 24;
-; CHECK-NEXT:    or.b64 %rd48, %rd44, %rd47;
-; CHECK-NEXT:    cvt.u64.u16 %rd49, %rs7;
-; CHECK-NEXT:    and.b64 %rd50, %rd49, 255;
-; CHECK-NEXT:    shl.b64 %rd51, %rd50, 32;
-; CHECK-NEXT:    or.b64 %rd52, %rd48, %rd51;
-; CHECK-NEXT:    cvt.u64.u16 %rd53, %rs8;
-; CHECK-NEXT:    and.b64 %rd54, %rd53, 255;
-; CHECK-NEXT:    shl.b64 %rd55, %rd54, 40;
-; CHECK-NEXT:    or.b64 %rd56, %rd52, %rd55;
-; CHECK-NEXT:    cvt.u64.u16 %rd57, %rs9;
-; CHECK-NEXT:    and.b64 %rd58, %rd57, 255;
-; CHECK-NEXT:    shl.b64 %rd59, %rd58, 48;
-; CHECK-NEXT:    or.b64 %rd60, %rd56, %rd59;
-; CHECK-NEXT:    cvt.u64.u16 %rd61, %rs10;
-; CHECK-NEXT:    shl.b64 %rd62, %rd61, 56;
-; CHECK-NEXT:    or.b64 %rd63, %rd60, %rd62;
+; CHECK-NEXT:    shl.b64 %rd37, %rd36, 8;
+; CHECK-NEXT:    or.b64 %rd38, %rd34, %rd37;
+; CHECK-NEXT:    cvt.u64.u16 %rd39, %rs5;
+; CHECK-NEXT:    and.b64 %rd40, %rd39, 255;
+; CHECK-NEXT:    shl.b64 %rd41, %rd40, 16;
+; CHECK-NEXT:    or.b64 %rd42, %rd38, %rd41;
+; CHECK-NEXT:    cvt.u64.u16 %rd43, %rs6;
+; CHECK-NEXT:    and.b64 %rd44, %rd43, 255;
+; CHECK-NEXT:    shl.b64 %rd45, %rd44, 24;
+; CHECK-NEXT:    or.b64 %rd46, %rd42, %rd45;
+; CHECK-NEXT:    cvt.u64.u16 %rd47, %rs7;
+; CHECK-NEXT:    and.b64 %rd48, %rd47, 255;
+; CHECK-NEXT:    shl.b64 %rd49, %rd48, 32;
+; CHECK-NEXT:    or.b64 %rd50, %rd46, %rd49;
+; CHECK-NEXT:    cvt.u64.u16 %rd51, %rs8;
+; CHECK-NEXT:    and.b64 %rd52, %rd51, 255;
+; CHECK-NEXT:    shl.b64 %rd53, %rd52, 40;
+; CHECK-NEXT:    or.b64 %rd54, %rd50, %rd53;
+; CHECK-NEXT:    cvt.u64.u16 %rd55, %rs9;
+; CHECK-NEXT:    and.b64 %rd56, %rd55, 255;
+; CHECK-NEXT:    shl.b64 %rd57, %rd56, 48;
+; CHECK-NEXT:    or.b64 %rd58, %rd54, %rd57;
+; CHECK-NEXT:    cvt.u64.u16 %rd59, %rs10;
+; CHECK-NEXT:    shl.b64 %rd60, %rd59, 56;
+; CHECK-NEXT:    or.b64 %rd61, %rd58, %rd60;
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd31;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+8], %rs2;
-; CHECK-NEXT:    st.param.b8 [func_retval0+12], %rd45;
-; CHECK-NEXT:    st.param.b8 [func_retval0+11], %rd41;
-; CHECK-NEXT:    st.param.b8 [func_retval0+10], %rd37;
-; CHECK-NEXT:    st.param.b8 [func_retval0+9], %rd35;
-; CHECK-NEXT:    shr.u64 %rd64, %rd52, 32;
+; CHECK-NEXT:    st.param.b8 [func_retval0+12], %rd43;
+; CHECK-NEXT:    st.param.b8 [func_retval0+11], %rd39;
+; CHECK-NEXT:    st.param.b8 [func_retval0+10], %rd35;
+; CHECK-NEXT:    st.param.b8 [func_retval0+9], %rd33;
+; CHECK-NEXT:    shr.u64 %rd64, %rd50, 32;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+13], %rd64;
-; CHECK-NEXT:    shr.u64 %rd65, %rd56, 40;
+; CHECK-NEXT:    shr.u64 %rd65, %rd54, 40;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+14], %rd65;
-; CHECK-NEXT:    shr.u64 %rd66, %rd60, 48;
+; CHECK-NEXT:    shr.u64 %rd66, %rd58, 48;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+15], %rd66;
-; CHECK-NEXT:    shr.u64 %rd67, %rd63, 56;
+; CHECK-NEXT:    shr.u64 %rd67, %rd61, 56;
 ; CHECK-NEXT:    st.param.b8 [func_retval0+16], %rd67;
 ; CHECK-NEXT:    st.param.b64 [func_retval0+24], %rd32;
 ; CHECK-NEXT:    ret;
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 167d7faafe5b3..ad2e7044e93bc 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -348,7 +348,7 @@ define dso_local void @qux() {
 ; CHECK-PTX-NEXT:    .local .align 8 .b8 __local_depot7[24];
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
-; CHECK-PTX-NEXT:    .reg .b32 %r<3>;
+; CHECK-PTX-NEXT:    .reg .b32 %r<2>;
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<8>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry



More information about the llvm-commits mailing list