[llvm] [NVPTX] Support for memory orderings for cmpxchg (PR #126159)

Akshay Deodhar via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 6 17:39:18 PST 2025


https://github.com/akshayrdeodhar updated https://github.com/llvm/llvm-project/pull/126159

>From 93bff73972f629367f41a8b24216410092233c8f Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Wed, 27 Nov 2024 03:43:49 +0000
Subject: [PATCH 1/3] [NVPTX] Support for memory orderings for cmpxchg

---
 llvm/include/llvm/CodeGen/TargetLowering.h  |    8 +
 llvm/lib/CodeGen/AtomicExpandPass.cpp       |    6 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp |   65 +
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h   |   11 +
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td    |  139 +-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h      |    2 +-
 llvm/test/CodeGen/NVPTX/atomics.ll          |    6 +-
 llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll     | 5680 +++++++++++++++++++
 llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll     | 5680 +++++++++++++++++++
 llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll     | 5680 +++++++++++++++++++
 llvm/test/CodeGen/NVPTX/cmpxchg.ll          | 1629 +++++-
 llvm/test/CodeGen/NVPTX/cmpxchg.py          |   46 +
 llvm/test/CodeGen/NVPTX/lit.local.cfg       |    2 +-
 13 files changed, 18815 insertions(+), 139 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg.py

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index bbecc7a6ddaee79..ac3233ec30ee3ea 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2173,6 +2173,14 @@ class TargetLoweringBase {
     return false;
   }
 
+  /// The memory ordering that AtomicExpandPass should assign to an atomic
+  /// instruction that it has lowered by adding fences. This can be used to
+  /// "fold" one of the fences into the instruction. Defaults to Monotonic.
+  virtual AtomicOrdering
+  atomicOperationOrderAfterFenceSplit(const Instruction *I) const {
+    return AtomicOrdering::Monotonic;
+  }
+
   /// Whether AtomicExpandPass should automatically insert a trailing fence
   /// without reducing the ordering for this atomic. Defaults to false.
   virtual bool
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index a75fa688d87a8dd..a3e9700fa3089c8 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -324,8 +324,10 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
       // failure path. As a result, fence insertion is directly done by
       // expandAtomicCmpXchg in that case.
       FenceOrdering = CASI->getMergedOrdering();
-      CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
-      CASI->setFailureOrdering(AtomicOrdering::Monotonic);
+      auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
+
+      CASI->setSuccessOrdering(CASOrdering);
+      CASI->setFailureOrdering(CASOrdering);
     }
 
     if (FenceOrdering != AtomicOrdering::Monotonic) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 58ad92a8934a66d..a74eada0afcebab 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -42,6 +42,7 @@
 #include "llvm/IR/FPEnv.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
@@ -49,6 +50,7 @@
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Alignment.h"
+#include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
@@ -995,6 +997,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // actions
   computeRegisterProperties(STI.getRegisterInfo());
 
+  // PTX support for 16-bit CAS is emulated. Only use 32+
   setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
   setMaxAtomicSizeInBitsSupported(64);
   setMaxDivRemBitWidthSupported(64);
@@ -5565,6 +5568,68 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   return AtomicExpansionKind::CmpXChg;
 }
 
+bool NVPTXTargetLowering::shouldInsertFencesForAtomic(
+    const Instruction *I) const {
+  auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
+  // When CAS bitwidth is not supported on the hardware, the CAS is emulated
+  // using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce
+  // the memory order using explicit fences around the retry loop.
+  // The memory order of natively supported CAS operations can be enforced
+  // by lowering to an atom.cas with the right memory synchronizing effect.
+  // However, atom.cas only supports relaxed, acquire, release and acq_rel.
+  // So we also use explicit fences for enforcing memory order for
+  // seq_cst CAS with natively-supported bitwidths.
+  return CI &&
+         (cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() <
+              STI.getMinCmpXchgSizeInBits() ||
+          CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent);
+}
+
+AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
+    const Instruction *I) const {
+  // Only a natively supported seq_cst cmpxchg keeps acquire on the CAS.
+  auto *CX = dyn_cast<AtomicCmpXchgInst>(I);
+  if (!CX || CX->getMergedOrdering() != AtomicOrdering::SequentiallyConsistent)
+    return AtomicOrdering::Monotonic;
+  auto *CmpTy = cast<IntegerType>(CX->getCompareOperand()->getType());
+  bool Native = CmpTy->getBitWidth() >= STI.getMinCmpXchgSizeInBits();
+  return Native ? AtomicOrdering::Acquire : AtomicOrdering::Monotonic;
+}
+
+Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
+                                                   Instruction *Inst,
+                                                   AtomicOrdering Ord) const {
+  // Only cmpxchg is specialized; everything else takes the generic path.
+  if (!isa<AtomicCmpXchgInst>(Inst))
+    return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
+
+  // Orderings weaker than release need no leading fence.
+  if (!isReleaseOrStronger(Ord))
+    return nullptr;
+
+  // seq_cst gets a seq_cst fence; any other ordering gets a release fence.
+  const bool IsSeqCst = Ord == AtomicOrdering::SequentiallyConsistent;
+  return Builder.CreateFence(IsSeqCst ? Ord : AtomicOrdering::Release);
+}
+
+Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
+                                                    Instruction *Inst,
+                                                    AtomicOrdering Ord) const {
+  // Only cmpxchg is specialized; everything else takes the generic path.
+  auto *CI = dyn_cast<AtomicCmpXchgInst>(Inst);
+  if (!CI)
+    return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
+  auto CASWidth =
+      cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth();
+  // No trailing fence for orderings weaker than acquire, nor for a seq_cst
+  // cmpxchg of natively supported width; otherwise emit an acquire fence.
+  if (!isAcquireOrStronger(Ord) ||
+      (Ord == AtomicOrdering::SequentiallyConsistent &&
+       CASWidth >= STI.getMinCmpXchgSizeInBits()))
+    return nullptr;
+  return Builder.CreateFence(AtomicOrdering::Acquire);
+}
+
 // Pin NVPTXTargetObjectFile's vtables to this file.
 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 5adf69d621552f3..1ee7a9b9ab8e36b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -17,6 +17,7 @@
 #include "NVPTX.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Support/AtomicOrdering.h"
 
 namespace llvm {
 namespace NVPTXISD {
@@ -260,6 +261,16 @@ class NVPTXTargetLowering : public TargetLowering {
     return true;
   }
 
+  bool shouldInsertFencesForAtomic(const Instruction *) const override;
+
+  AtomicOrdering
+  atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;
+
+  Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+                                AtomicOrdering Ord) const override;
+  Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+                                 AtomicOrdering Ord) const override;
+
 private:
   const NVPTXSubtarget &STI; // cache the subtarget here
   SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 06c629c01d9ab09..14744411bc74bcd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1962,41 +1962,41 @@ multiclass F_ATOMIC_2_NEG<ValueType regT, NVPTXRegClass regclass, string SpaceSt
 
 // has 3 operands
 multiclass F_ATOMIC_3_imp<ValueType ptrT, NVPTXRegClass ptrclass,
-  ValueType regT, NVPTXRegClass regclass,
+  ValueType regT, NVPTXRegClass regclass, string SemStr,
   string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
   Operand IMMType, list<Predicate> Pred> {
   let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
     def reg : NVPTXInst<(outs regclass:$dst),
       (ins ptrclass:$addr, regclass:$b, regclass:$c),
-      !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+      !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
       [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), (regT regclass:$c)))]>,
     Requires<Pred>;
 
     def imm1 : NVPTXInst<(outs regclass:$dst),
       (ins ptrclass:$addr, IMMType:$b, regclass:$c),
-      !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+      !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
       [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, (regT regclass:$c)))]>,
     Requires<Pred>;
 
     def imm2 : NVPTXInst<(outs regclass:$dst),
       (ins ptrclass:$addr, regclass:$b, IMMType:$c),
-      !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
+      !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
       [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), imm:$c))]>,
     Requires<Pred>;
 
     def imm3 : NVPTXInst<(outs regclass:$dst),
       (ins ptrclass:$addr, IMMType:$b, IMMType:$c),
-      !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+      !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
       [(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, imm:$c))]>,
     Requires<Pred>;
   }
 }
-multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SpaceStr, string TypeStr,
-  string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
-  defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
-    IntOp, IMMType, Pred>;
-  defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
-    IntOp, IMMType, Pred>;
+multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SemStr, string SpaceStr,
+  string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
+  defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SemStr, SpaceStr, TypeStr,
+    OpcStr, IntOp, IMMType, Pred>;
+  defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SemStr, SpaceStr, TypeStr,
+    OpcStr, IntOp, IMMType, Pred>;
 }
 
 // atom_add
@@ -2378,51 +2378,86 @@ defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".xor",
 defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64",
   ".xor", atomic_load_xor_i64_gen, i64imm, imm, [hasSM<32>]>;
 
-// atom_cas
-
-def atomic_cmp_swap_i16_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i16_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i16_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
-  (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
-
-defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
+multiclass ternary_atomic_op_as {
+  // one record per address space
+  def NAME#_gen: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
+      (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
+      AS_match.generic>;
+
+  def NAME#_g: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
+      (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
+      AS_match.global>;
+
+  def NAME#_s: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
+      (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
+      AS_match.shared>;
+}
+
+// generate pattern fragments for size x memory order
+// NOTE: i8 cmpxchg is not supported in ptx, and AtomicExpandPass will emulate all i8 cmpxchgs
+// using larger-bitwidth cas
+foreach size = ["i16", "i32", "i64"] in {
+  foreach order = ["", "_monotonic", "_acquire", "_release", "_acq_rel", "_seq_cst"] in {
+    defm atomic_cmp_swap#_#size#order: ternary_atomic_op_as;
+  }
+}
+
+multiclass INT_PTX_ATOM_CAS_AS<string atomic_cmp_swap_without_as, string type, string order> {
+  // eg. with type = 32, order = .acquire, atomic_cmp_swap_without_as = atomic_cmp_swap_i32_acquire, _GEN becomes
+  // F_ATOMIC_3<i32, Int32Regs, ".acquire", "", ".b32", ".cas", atomic_cmp_swap_i32_acquire_gen, i32imm, [hasSM<70>, hasPTX<63>]>
+
+
+  // For SM70, PTX63 +, memory orders are supported
+  defm _GEN: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, "", ".b"#type, ".cas",
+                       !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+
+  defm _GEN_USE_G: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas",
+                       !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+
+  defm _G: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas",
+                       !cast<PatFrag>(atomic_cmp_swap_without_as#_g), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+
+  defm _S: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, ".shared", ".b"#type, ".cas",
+                       !cast<PatFrag>(atomic_cmp_swap_without_as#_s), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+
+  // For older archs, we fall back to lowering to relaxed atom.cas
+  defm _GEN_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", "", ".b"#type, ".cas",
+                       !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm")>;
+
+  defm _GEN_USE_G_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", ".global", ".b"#type, ".cas",
+                       !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm")>;
+
+  defm _G_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", ".global", ".b"#type, ".cas",
+                       !cast<PatFrag>(atomic_cmp_swap_without_as#_g), !cast<Operand>("i"#type#"imm")>;
+
+  defm _S_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", ".shared", ".b"#type, ".cas",
+                       !cast<PatFrag>(atomic_cmp_swap_without_as#_s), !cast<Operand>("i"#type#"imm")>;
+}
+
+// Define atom.cas for all combinations of size x memory order supported in PTX *and* on the hardware.
+foreach size = ["32", "64"] in {
+  // We enumerate the instructions with specific memory orders ["acquire", "release", "acq_rel"]
+  // *before* the instructions with the unspecified memory order [""].
+  // This ensures that `cmpxchg ... acquire` is lowered to "atom.acquire.cas" instead of "atom.cas"
+  // Note that AtomicExpand converts a natively supported cmpxchg seq_cst into a cmpxchg acquire preceded by a seq_cst fence.
+  foreach order = ["acquire", "release", "acq_rel", ""] in {
+    if !eq(order, "") then {
+      defm INT_PTX_ATOM_CAS_#size : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size, size, ".relaxed">;
+    } else {
+      defm INT_PTX_ATOM_CAS_#size#_#order : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size#_#order, size, "."#order>;
+    }
+  }
+}
+
+// Note that 16-bit CAS support in PTX is *emulated*. (TODO: public?)
+defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, "", ".global", ".b16", ".cas",
   atomic_cmp_swap_i16_g, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, ".shared", ".b16", ".cas",
+defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, "", ".shared", ".b16", ".cas",
   atomic_cmp_swap_i16_s, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", ".b16", ".cas",
+defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", "", ".b16", ".cas",
   atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
+defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, "", ".global", ".b16", ".cas",
   atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas",
-  atomic_cmp_swap_i32_g, i32imm>;
-defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas",
-  atomic_cmp_swap_i32_s, i32imm>;
-defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<i32, Int32Regs, "", ".b32", ".cas",
-  atomic_cmp_swap_i32_gen, i32imm>;
-defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32",
-  ".cas", atomic_cmp_swap_i32_gen, i32imm>;
-defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64", ".cas",
-  atomic_cmp_swap_i64_g, i64imm>;
-defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<i64, Int64Regs, ".shared", ".b64", ".cas",
-  atomic_cmp_swap_i64_s, i64imm>;
-defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<i64, Int64Regs, "", ".b64", ".cas",
-  atomic_cmp_swap_i64_gen, i64imm>;
-defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64",
-  ".cas", atomic_cmp_swap_i64_gen, i64imm>;
 
 // Support for scoped atomic operations.  Matches
 // int_nvvm_atomic_{op}_{space}_{type}_{scope}
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 851c9152e4cb8ff..f893f1aefec8485 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -147,7 +147,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   //  set of equivalent memory operations with a scalar data-type, executed in
   //  an unspecified order on the elements in the vector.
   unsigned getMaxRequiredAlignment() const { return 8; }
-  // Emulated loops with 32-bit/64-bit CAS generate better SASS than 16-bit CAS
+  // Get the smallest cmpxchg word size that the hardware supports.
   unsigned getMinCmpXchgSizeInBits() const { return 32; }
 
   unsigned getPTXVersion() const { return PTXVersion; }
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index fccc3a8844ffc4f..ac2bd8828b20579 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -177,7 +177,7 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
 
 ; CHECK-LABEL: atomicrmw_add_f16_generic
 define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
-; CHECK: atom.cas
+; CHECK: atom.acquire.cas
   %ret = atomicrmw fadd ptr %addr, half %val seq_cst
   ret half %ret
 }
@@ -198,14 +198,14 @@ define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
 
 ; CHECK-LABEL: atomic_cmpxchg_i32
 define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) {
-; CHECK: atom.cas.b32
+; CHECK: atom.acquire.cas.b32
   %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
   ret i32 %new
 }
 
 ; CHECK-LABEL: atomic_cmpxchg_i64
 define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) {
-; CHECK: atom.cas.b64
+; CHECK: atom.acquire.cas.b64
   %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
   ret i64 %new
 }
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
new file mode 100644
index 000000000000000..ea308c2a7673b7b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
@@ -0,0 +1,5680 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %}
+
+define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB0_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB0_1;
+; SM60-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+    ret i8 %new
+}
+
+define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB1_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB1_1;
+; SM60-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
+    ret i8 %new
+}
+
+define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB2_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB2_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB2_1;
+; SM60-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
+    ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB3_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB3_1;
+; SM60-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
+    ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB4_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB4_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB4_1;
+; SM60-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
+    ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB5_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB5_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB5_1;
+; SM60-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
+    ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB6_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB6_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB6_1;
+; SM60-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
+    ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB7_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB7_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB7_1;
+; SM60-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
+    ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB8_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB8_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB8_1;
+; SM60-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
+    ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB9_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB9_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB9_1;
+; SM60-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
+    ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB10_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB10_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB10_1;
+; SM60-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
+    ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB11_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB11_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB11_1;
+; SM60-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
+    ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB12_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB12_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB12_1;
+; SM60-NEXT:  $L__BB12_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
+    ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB13_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB13_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB13_1;
+; SM60-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
+    ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB14_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB14_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB14_1;
+; SM60-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
+    ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB15_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB15_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB15_1;
+; SM60-NEXT:  $L__BB15_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
+    ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB16_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB16_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB16_1;
+; SM60-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
+    ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB17_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB17_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB17_1;
+; SM60-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
+    ret i8 %new
+}
+
+define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB18_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB18_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB18_1;
+; SM60-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
+    ret i8 %new
+}
+
+define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB19_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB19_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB19_1;
+; SM60-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
+    ret i8 %new
+}
+
+define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB20_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB20_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB20_1;
+; SM60-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
+    ret i8 %new
+}
+
+define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [release_acquire_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB21_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB21_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB21_1;
+; SM60-NEXT:  $L__BB21_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
+    ret i8 %new
+}
+
+define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [release_acquire_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB22_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB22_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB22_1;
+; SM60-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
+    ret i8 %new
+}
+
+define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [release_acquire_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB23_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB23_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB23_1;
+; SM60-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB24_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB24_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB24_1;
+; SM60-NEXT:  $L__BB24_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB25_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB25_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB25_1;
+; SM60-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB26_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB26_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB26_1;
+; SM60-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB27_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB27_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB27_1;
+; SM60-NEXT:  $L__BB27_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
+    ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB28_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB28_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB28_1;
+; SM60-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
+    ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB29_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB29_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB29_1;
+; SM60-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB30_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB30_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB30_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB30_1;
+; SM60-NEXT:  $L__BB30_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB31_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB31_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB31_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB31_1;
+; SM60-NEXT:  $L__BB31_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB32_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB32_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB32_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB32_1;
+; SM60-NEXT:  $L__BB32_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB33_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB33_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB33_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB33_1;
+; SM60-NEXT:  $L__BB33_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB34_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB34_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB34_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB34_1;
+; SM60-NEXT:  $L__BB34_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB35_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB35_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB35_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB35_1;
+; SM60-NEXT:  $L__BB35_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB36_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB36_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB36_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB36_1;
+; SM60-NEXT:  $L__BB36_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB37_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB37_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB37_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB37_1;
+; SM60-NEXT:  $L__BB37_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB38_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB38_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB38_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB38_1;
+; SM60-NEXT:  $L__BB38_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB39_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB39_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB39_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB39_1;
+; SM60-NEXT:  $L__BB39_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB40_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB40_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB40_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB40_1;
+; SM60-NEXT:  $L__BB40_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB41_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB41_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB41_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB41_1;
+; SM60-NEXT:  $L__BB41_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB42_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB42_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB42_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB42_1;
+; SM60-NEXT:  $L__BB42_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB43_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB43_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB43_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB43_1;
+; SM60-NEXT:  $L__BB43_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<21>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT:    and.b32 %r10, %r9, 3;
+; SM60-NEXT:    shl.b32 %r1, %r10, 3;
+; SM60-NEXT:    mov.b32 %r11, 255;
+; SM60-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM60-NEXT:    not.b32 %r2, %r12;
+; SM60-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT:    and.b32 %r14, %r13, 255;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM60-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM60-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT:    and.b32 %r20, %r16, %r2;
+; SM60-NEXT:  $L__BB44_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r17, %r20, %r3;
+; SM60-NEXT:    or.b32 %r18, %r20, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT:    @%p1 bra $L__BB44_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB44_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT:    mov.u32 %r20, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB44_1;
+; SM60-NEXT:  $L__BB44_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    ret i8 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB45_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB45_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB45_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB45_1;
+; SM60-NEXT:  $L__BB45_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB46_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB46_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB46_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB46_1;
+; SM60-NEXT:  $L__BB46_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB47_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB47_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB47_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB47_1;
+; SM60-NEXT:  $L__BB47_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB48_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB48_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB48_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB48_1;
+; SM60-NEXT:  $L__BB48_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB49_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB49_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB49_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB49_1;
+; SM60-NEXT:  $L__BB49_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB50_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB50_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB50_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB50_1;
+; SM60-NEXT:  $L__BB50_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB51_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB51_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB51_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB51_1;
+; SM60-NEXT:  $L__BB51_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB52_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB52_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB52_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB52_1;
+; SM60-NEXT:  $L__BB52_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB53_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB53_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB53_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB53_1;
+; SM60-NEXT:  $L__BB53_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB54_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB54_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB54_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB54_1;
+; SM60-NEXT:  $L__BB54_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB55_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB55_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB55_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB55_1;
+; SM60-NEXT:  $L__BB55_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB56_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB56_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB56_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB56_1;
+; SM60-NEXT:  $L__BB56_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB57_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB57_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB57_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB57_1;
+; SM60-NEXT:  $L__BB57_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_global_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB58_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB58_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB58_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB58_1;
+; SM60-NEXT:  $L__BB58_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1];
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB59_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB59_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB59_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB59_1;
+; SM60-NEXT:  $L__BB59_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB60_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB60_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB60_1;
+; SM60-NEXT:  $L__BB60_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB61_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB61_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB61_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB61_1;
+; SM60-NEXT:  $L__BB61_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB62_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB62_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB62_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB62_1;
+; SM60-NEXT:  $L__BB62_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_generic_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB63_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB63_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB63_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB63_1;
+; SM60-NEXT:  $L__BB63_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_global_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB64_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB64_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB64_1;
+; SM60-NEXT:  $L__BB64_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_shared_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB65_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB65_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB65_1;
+; SM60-NEXT:  $L__BB65_3: // %partword.cmpxchg.end
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [release_acquire_i16_generic_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB66_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB66_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB66_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB66_1;
+; SM60-NEXT:  $L__BB66_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [release_acquire_i16_global_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB67_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB67_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB67_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB67_1;
+; SM60-NEXT:  $L__BB67_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [release_acquire_i16_shared_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB68_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB68_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB68_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB68_1;
+; SM60-NEXT:  $L__BB68_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB69_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB69_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB69_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB69_1;
+; SM60-NEXT:  $L__BB69_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_global_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB70_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB70_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB70_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB70_1;
+; SM60-NEXT:  $L__BB70_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB71_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB71_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB71_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB71_1;
+; SM60-NEXT:  $L__BB71_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB72_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB72_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB72_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB72_1;
+; SM60-NEXT:  $L__BB72_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB73_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB73_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB73_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB73_1;
+; SM60-NEXT:  $L__BB73_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB74_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB74_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB74_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB74_1;
+; SM60-NEXT:  $L__BB74_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB75_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB75_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB75_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB75_1;
+; SM60-NEXT:  $L__BB75_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB76_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB76_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB76_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB76_1;
+; SM60-NEXT:  $L__BB76_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB77_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB77_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB77_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB77_1;
+; SM60-NEXT:  $L__BB77_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB78_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB78_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB78_1;
+; SM60-NEXT:  $L__BB78_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB79_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB79_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB79_1;
+; SM60-NEXT:  $L__BB79_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB80_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB80_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB80_1;
+; SM60-NEXT:  $L__BB80_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB81_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB81_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB81_1;
+; SM60-NEXT:  $L__BB81_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB82_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB82_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB82_1;
+; SM60-NEXT:  $L__BB82_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB83_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB83_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB83_1;
+; SM60-NEXT:  $L__BB83_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB84_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB84_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB84_1;
+; SM60-NEXT:  $L__BB84_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB85_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB85_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB85_1;
+; SM60-NEXT:  $L__BB85_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB86_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB86_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB86_1;
+; SM60-NEXT:  $L__BB86_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB87_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB87_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB87_1;
+; SM60-NEXT:  $L__BB87_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_global(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB88_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB88_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB88_1;
+; SM60-NEXT:  $L__BB88_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .pred %p<3>;
+; SM60-NEXT:    .reg .b16 %rs<2>;
+; SM60-NEXT:    .reg .b32 %r<20>;
+; SM60-NEXT:    .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
+; SM60-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM60-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT:    and.b32 %r11, %r10, 3;
+; SM60-NEXT:    shl.b32 %r1, %r11, 3;
+; SM60-NEXT:    mov.b32 %r12, 65535;
+; SM60-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM60-NEXT:    not.b32 %r2, %r13;
+; SM60-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM60-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM60-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT:    and.b32 %r19, %r15, %r2;
+; SM60-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
+; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM60-NEXT:    or.b32 %r16, %r19, %r3;
+; SM60-NEXT:    or.b32 %r17, %r19, %r4;
+; SM60-NEXT:    atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT:    @%p1 bra $L__BB89_3;
+; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT:    // in Loop: Header=BB89_1 Depth=1
+; SM60-NEXT:    and.b32 %r8, %r7, %r2;
+; SM60-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT:    mov.u32 %r19, %r8;
+; SM60-NEXT:    @%p2 bra $L__BB89_1;
+; SM60-NEXT:  $L__BB89_3: // %partword.cmpxchg.end
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
+    ret i16 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_global_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_generic_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [release_acquire_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [release_acquire_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_global_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [release_acquire_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [release_acquire_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_shared_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [release_acquire_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [release_acquire_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0];
+; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
+; SM60-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2];
+; SM60-NEXT:    atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b32 %r<4>;
+; SM60-NEXT:    .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
+; SM60-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
+    ret i32 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_global_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_generic_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_global_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_shared_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
+    ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
+    ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
+    ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
+    ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
+    ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
+    ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
+    ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+    ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
+    ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0];
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
+    ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
+    ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
+    ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic
+    ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire
+    ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire
+    ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire
+    ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_generic(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
+; SM60-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_global(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
+; SM60-NEXT:    atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_shared(
+; SM60:       {
+; SM60-NEXT:    .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
+; SM60-NEXT:    membar.sys;
+; SM60-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
+; SM60-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
+; SM60-NEXT:    atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    ret i64 %new
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
new file mode 100644
index 000000000000000..4360ea36e863ab2
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
@@ -0,0 +1,5680 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+
+define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB0_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB0_1;
+; SM70-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+    ret i8 %new
+}
+
+define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB1_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB1_1;
+; SM70-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
+    ret i8 %new
+}
+
+define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB2_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB2_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB2_1;
+; SM70-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
+    ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB3_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB3_1;
+; SM70-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
+    ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB4_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB4_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB4_1;
+; SM70-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
+    ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB5_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB5_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB5_1;
+; SM70-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
+    ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB6_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB6_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB6_1;
+; SM70-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
+    ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB7_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB7_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB7_1;
+; SM70-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
+    ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB8_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB8_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB8_1;
+; SM70-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
+    ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB9_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB9_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB9_1;
+; SM70-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
+    ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB10_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB10_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB10_1;
+; SM70-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
+    ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB11_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB11_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB11_1;
+; SM70-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
+    ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB12_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB12_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB12_1;
+; SM70-NEXT:  $L__BB12_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
+    ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB13_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB13_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB13_1;
+; SM70-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
+    ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB14_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB14_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB14_1;
+; SM70-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
+    ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB15_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB15_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB15_1;
+; SM70-NEXT:  $L__BB15_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
+    ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB16_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB16_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB16_1;
+; SM70-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
+    ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB17_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB17_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB17_1;
+; SM70-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
+    ret i8 %new
+}
+
+define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB18_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB18_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB18_1;
+; SM70-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
+    ret i8 %new
+}
+
+define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_global_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB19_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB19_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB19_1;
+; SM70-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
+    ret i8 %new
+}
+
+define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB20_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB20_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB20_1;
+; SM70-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
+    ret i8 %new
+}
+
+define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_generic_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [release_acquire_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB21_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB21_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB21_1;
+; SM70-NEXT:  $L__BB21_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
+    ret i8 %new
+}
+
+define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_global_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [release_acquire_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB22_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB22_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB22_1;
+; SM70-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
+    ret i8 %new
+}
+
+define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_shared_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [release_acquire_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB23_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB23_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB23_1;
+; SM70-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB24_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB24_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB24_1;
+; SM70-NEXT:  $L__BB24_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB25_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB25_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB25_1;
+; SM70-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB26_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB26_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB26_1;
+; SM70-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB27_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB27_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB27_1;
+; SM70-NEXT:  $L__BB27_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
+    ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB28_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB28_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB28_1;
+; SM70-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
+    ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB29_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB29_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB29_1;
+; SM70-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB30_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB30_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB30_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB30_1;
+; SM70-NEXT:  $L__BB30_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB31_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB31_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB31_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB31_1;
+; SM70-NEXT:  $L__BB31_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB32_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB32_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB32_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB32_1;
+; SM70-NEXT:  $L__BB32_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB33_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB33_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB33_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB33_1;
+; SM70-NEXT:  $L__BB33_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB34_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB34_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB34_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB34_1;
+; SM70-NEXT:  $L__BB34_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB35_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB35_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB35_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB35_1;
+; SM70-NEXT:  $L__BB35_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB36_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB36_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB36_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB36_1;
+; SM70-NEXT:  $L__BB36_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB37_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB37_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB37_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB37_1;
+; SM70-NEXT:  $L__BB37_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB38_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB38_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB38_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB38_1;
+; SM70-NEXT:  $L__BB38_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB39_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB39_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB39_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB39_1;
+; SM70-NEXT:  $L__BB39_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB40_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB40_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB40_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB40_1;
+; SM70-NEXT:  $L__BB40_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB41_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB41_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB41_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB41_1;
+; SM70-NEXT:  $L__BB41_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB42_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB42_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB42_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB42_1;
+; SM70-NEXT:  $L__BB42_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB43_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB43_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB43_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB43_1;
+; SM70-NEXT:  $L__BB43_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB44_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB44_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB44_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB44_1;
+; SM70-NEXT:  $L__BB44_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    ret i8 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB45_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB45_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB45_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB45_1;
+; SM70-NEXT:  $L__BB45_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB46_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB46_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB46_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB46_1;
+; SM70-NEXT:  $L__BB46_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB47_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB47_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB47_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB47_1;
+; SM70-NEXT:  $L__BB47_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB48_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB48_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB48_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB48_1;
+; SM70-NEXT:  $L__BB48_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB49_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB49_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB49_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB49_1;
+; SM70-NEXT:  $L__BB49_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire ; success=monotonic, failure=acquire; checks expect a relaxed b32 CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB50_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB50_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB50_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB50_1;
+; SM70-NEXT:  $L__BB50_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire ; success=monotonic, failure=acquire; checks expect a relaxed b32 CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB51_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB51_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB51_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB51_1;
+; SM70-NEXT:  $L__BB51_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst ; success=monotonic, failure=seq_cst; checks expect fence.sc.sys, a relaxed b32 CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB52_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB52_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB52_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB52_1;
+; SM70-NEXT:  $L__BB52_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst ; success=monotonic, failure=seq_cst; checks expect fence.sc.sys, a relaxed b32 CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB53_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB53_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB53_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB53_1;
+; SM70-NEXT:  $L__BB53_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst ; success=monotonic, failure=seq_cst; checks expect fence.sc.sys, a relaxed b32 CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB54_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB54_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB54_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB54_1;
+; SM70-NEXT:  $L__BB54_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic ; success=acquire, failure=monotonic; checks expect a relaxed b32 CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB55_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB55_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB55_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB55_1;
+; SM70-NEXT:  $L__BB55_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic ; success=acquire, failure=monotonic; checks expect a relaxed b32 CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB56_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB56_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB56_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB56_1;
+; SM70-NEXT:  $L__BB56_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic ; success=acquire, failure=monotonic; checks expect a relaxed b32 CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB57_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB57_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB57_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB57_1;
+; SM70-NEXT:  $L__BB57_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire ; success=acquire, failure=acquire; checks expect a relaxed b32 CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_global_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB58_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB58_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB58_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB58_1;
+; SM70-NEXT:  $L__BB58_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire ; success=acquire, failure=acquire; checks expect a relaxed b32 CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB59_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB59_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB59_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB59_1;
+; SM70-NEXT:  $L__BB59_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire ; success=acquire, failure=acquire; checks expect a relaxed b32 CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB60_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB60_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB60_1;
+; SM70-NEXT:  $L__BB60_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst ; success=acquire, failure=seq_cst; checks expect fence.sc.sys, a relaxed b32 CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB61_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB61_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB61_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB61_1;
+; SM70-NEXT:  $L__BB61_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst ; success=acquire, failure=seq_cst; checks expect fence.sc.sys, a relaxed b32 CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB62_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB62_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB62_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB62_1;
+; SM70-NEXT:  $L__BB62_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst ; success=acquire, failure=seq_cst; checks expect fence.sc.sys, a relaxed b32 CAS loop, then fence.acq_rel.sys
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_generic_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB63_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB63_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB63_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB63_1;
+; SM70-NEXT:  $L__BB63_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic ; success=release, failure=monotonic; checks expect fence.acq_rel.sys, then a relaxed b32 CAS loop with no trailing fence
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_global_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_global_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB64_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB64_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB64_1;
+; SM70-NEXT:  $L__BB64_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic ; success=release, failure=monotonic; checks expect fence.acq_rel.sys, then a relaxed b32 CAS loop with no trailing fence
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_shared_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB65_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB65_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB65_1;
+; SM70-NEXT:  $L__BB65_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic ; success=release, failure=monotonic; checks expect fence.acq_rel.sys, then a relaxed b32 CAS loop with no trailing fence
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_generic_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [release_acquire_i16_generic_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB66_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB66_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB66_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB66_1;
+; SM70-NEXT:  $L__BB66_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire ; success=release, failure=acquire; checks expect fence.acq_rel.sys both before and after a relaxed b32 CAS loop
+    ret i16 %new ; returns the %new argument; the cmpxchg result %pairold is unused
+}
+
+define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_global_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [release_acquire_i16_global_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB67_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB67_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB67_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB67_1;
+; SM70-NEXT:  $L__BB67_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_shared_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [release_acquire_i16_shared_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB68_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB68_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB68_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB68_1;
+; SM70-NEXT:  $L__BB68_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB69_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB69_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB69_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB69_1;
+; SM70-NEXT:  $L__BB69_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_global_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB70_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB70_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB70_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB70_1;
+; SM70-NEXT:  $L__BB70_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB71_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB71_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB71_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB71_1;
+; SM70-NEXT:  $L__BB71_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB72_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB72_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB72_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB72_1;
+; SM70-NEXT:  $L__BB72_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB73_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB73_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB73_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB73_1;
+; SM70-NEXT:  $L__BB73_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB74_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB74_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB74_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB74_1;
+; SM70-NEXT:  $L__BB74_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB75_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB75_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB75_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB75_1;
+; SM70-NEXT:  $L__BB75_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB76_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB76_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB76_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB76_1;
+; SM70-NEXT:  $L__BB76_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB77_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB77_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB77_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB77_1;
+; SM70-NEXT:  $L__BB77_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB78_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB78_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB78_1;
+; SM70-NEXT:  $L__BB78_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB79_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB79_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB79_1;
+; SM70-NEXT:  $L__BB79_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB80_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB80_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB80_1;
+; SM70-NEXT:  $L__BB80_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB81_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB81_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB81_1;
+; SM70-NEXT:  $L__BB81_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB82_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB82_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB82_1;
+; SM70-NEXT:  $L__BB82_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB83_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB83_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB83_1;
+; SM70-NEXT:  $L__BB83_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB84_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB84_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB84_1;
+; SM70-NEXT:  $L__BB84_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB85_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB85_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB85_1;
+; SM70-NEXT:  $L__BB85_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB86_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB86_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB86_1;
+; SM70-NEXT:  $L__BB86_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB87_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB87_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB87_1;
+; SM70-NEXT:  $L__BB87_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_global(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB88_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB88_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB88_1;
+; SM70-NEXT:  $L__BB88_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB89_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB89_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB89_1;
+; SM70-NEXT:  $L__BB89_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
+    ret i16 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2];
+; SM70-NEXT:    atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2];
+; SM70-NEXT:    atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2];
+; SM70-NEXT:    atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_generic_param_2];
+; SM70-NEXT:    atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_global_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_global_param_2];
+; SM70-NEXT:    atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_shared_param_2];
+; SM70-NEXT:    atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_generic_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [release_acquire_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [release_acquire_i32_generic_param_2];
+; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_global_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [release_acquire_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [release_acquire_i32_global_param_2];
+; SM70-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_shared_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [release_acquire_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [release_acquire_i32_shared_param_2];
+; SM70-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2];
+; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2];
+; SM70-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2];
+; SM70-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2];
+; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2];
+; SM70-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2];
+; SM70-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
+    ret i32 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2];
+; SM70-NEXT:    atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2];
+; SM70-NEXT:    atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2];
+; SM70-NEXT:    atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2];
+; SM70-NEXT:    atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_global_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_global_param_2];
+; SM70-NEXT:    atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2];
+; SM70-NEXT:    atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_generic_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_generic_param_2];
+; SM70-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_global_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_global_param_2];
+; SM70-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_shared_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_shared_param_2];
+; SM70-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
+    ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
+    ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
+    ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
+    ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
+; SM70-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
+    ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2];
+; SM70-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
+    ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
+; SM70-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
+    ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2];
+; SM70-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+    ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2];
+; SM70-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
+    ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2];
+; SM70-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
+    ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
+    ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
+    ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic
+    ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire
+    ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire
+    ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire
+    ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_generic(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
+; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_global(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
+; SM70-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_shared(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
+; SM70-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    ret i64 %new
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
new file mode 100644
index 000000000000000..5acb275a6f5810b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
@@ -0,0 +1,5680 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}
+
+define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB0_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB0_1;
+; SM90-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+    ret i8 %new
+}
+
+define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB1_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB1_1;
+; SM90-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
+    ret i8 %new
+}
+
+define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB2_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB2_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB2_1;
+; SM90-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
+    ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB3_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB3_1;
+; SM90-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
+    ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB4_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB4_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB4_1;
+; SM90-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
+    ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB5_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB5_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB5_1;
+; SM90-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
+    ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB6_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB6_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB6_1;
+; SM90-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
+    ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB7_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB7_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB7_1;
+; SM90-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
+    ret i8 %new
+}
+
+; i8 cmpxchg on a shared-memory pointer (addrspace(3)) with success ordering
+; monotonic and failure ordering seq_cst. The expected PTX emits fence.sc.sys
+; up front, retries atom.relaxed.shared.cas.b32 on the enclosing 4-byte-aligned
+; word until the CAS succeeds or the byte of interest changed, and ends with
+; fence.acquire.sys. The cmpxchg result (%pairold) is unused and the function
+; returns %new.
+define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB8_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB8_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB8_1;
+; SM90-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
+    ret i8 %new
+}
+
+; i8 cmpxchg on a generic pointer (plain ptr) with success ordering acquire
+; and failure ordering monotonic. The expected PTX has no leading fence,
+; retries atom.relaxed.cas.b32 on the enclosing 4-byte-aligned word until the
+; CAS succeeds or the byte of interest changed, and ends with
+; fence.acquire.sys. The cmpxchg result (%pairold) is unused and the function
+; returns %new.
+define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB9_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB9_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB9_1;
+; SM90-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
+    ret i8 %new
+}
+
+; i8 cmpxchg on a global-memory pointer (addrspace(1)) with success ordering
+; acquire and failure ordering monotonic. The expected PTX has no leading
+; fence, retries atom.relaxed.global.cas.b32 on the enclosing 4-byte-aligned
+; word until the CAS succeeds or the byte of interest changed, and ends with
+; fence.acquire.sys. The cmpxchg result (%pairold) is unused and the function
+; returns %new.
+define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB10_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB10_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB10_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB10_1;
+; SM90-NEXT:  $L__BB10_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
+    ret i8 %new
+}
+
+; i8 cmpxchg on a shared-memory pointer (addrspace(3)) with success ordering
+; acquire and failure ordering monotonic. The expected PTX has no leading
+; fence, retries atom.relaxed.shared.cas.b32 on the enclosing 4-byte-aligned
+; word until the CAS succeeds or the byte of interest changed, and ends with
+; fence.acquire.sys. The cmpxchg result (%pairold) is unused and the function
+; returns %new.
+define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB11_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB11_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB11_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB11_1;
+; SM90-NEXT:  $L__BB11_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
+    ret i8 %new
+}
+
+; i8 cmpxchg on a generic pointer (plain ptr) with success ordering acquire
+; and failure ordering acquire. The expected PTX has no leading fence,
+; retries atom.relaxed.cas.b32 on the enclosing 4-byte-aligned word until the
+; CAS succeeds or the byte of interest changed, and ends with
+; fence.acquire.sys. The cmpxchg result (%pairold) is unused and the function
+; returns %new.
+define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB12_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB12_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB12_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB12_1;
+; SM90-NEXT:  $L__BB12_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
+    ret i8 %new
+}
+
+; i8 cmpxchg on a global-memory pointer (addrspace(1)) with success ordering
+; acquire and failure ordering acquire. The expected PTX has no leading
+; fence, retries atom.relaxed.global.cas.b32 on the enclosing 4-byte-aligned
+; word until the CAS succeeds or the byte of interest changed, and ends with
+; fence.acquire.sys. The cmpxchg result (%pairold) is unused and the function
+; returns %new.
+define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB13_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB13_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB13_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB13_1;
+; SM90-NEXT:  $L__BB13_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
+    ret i8 %new
+}
+
+; i8 cmpxchg on a shared-memory pointer (addrspace(3)) with success ordering
+; acquire and failure ordering acquire. The expected PTX has no leading
+; fence, retries atom.relaxed.shared.cas.b32 on the enclosing 4-byte-aligned
+; word until the CAS succeeds or the byte of interest changed, and ends with
+; fence.acquire.sys. The cmpxchg result (%pairold) is unused and the function
+; returns %new.
+define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB14_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB14_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB14_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB14_1;
+; SM90-NEXT:  $L__BB14_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
+    ret i8 %new
+}
+
+; i8 cmpxchg on a generic pointer (plain ptr) with success ordering acquire
+; and failure ordering seq_cst. The expected PTX emits fence.sc.sys up front,
+; retries atom.relaxed.cas.b32 on the enclosing 4-byte-aligned word until the
+; CAS succeeds or the byte of interest changed, and ends with
+; fence.acquire.sys. The cmpxchg result (%pairold) is unused and the function
+; returns %new.
+define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB15_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB15_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB15_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB15_1;
+; SM90-NEXT:  $L__BB15_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
+    ret i8 %new
+}
+
+; i8 cmpxchg on a global-memory pointer (addrspace(1)) with success ordering
+; acquire and failure ordering seq_cst. The expected PTX emits fence.sc.sys
+; up front, retries atom.relaxed.global.cas.b32 on the enclosing 4-byte-aligned
+; word until the CAS succeeds or the byte of interest changed, and ends with
+; fence.acquire.sys. The cmpxchg result (%pairold) is unused and the function
+; returns %new.
+define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB16_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB16_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB16_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB16_1;
+; SM90-NEXT:  $L__BB16_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
+    ret i8 %new
+}
+
+; i8 cmpxchg on a shared-memory pointer (addrspace(3)) with success ordering
+; acquire and failure ordering seq_cst. The expected PTX emits fence.sc.sys
+; up front, retries atom.relaxed.shared.cas.b32 on the enclosing 4-byte-aligned
+; word until the CAS succeeds or the byte of interest changed, and ends with
+; fence.acquire.sys. The cmpxchg result (%pairold) is unused and the function
+; returns %new.
+define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB17_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB17_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB17_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB17_1;
+; SM90-NEXT:  $L__BB17_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
+    ret i8 %new
+}
+
+; i8 cmpxchg on a generic pointer (plain ptr) with success ordering release
+; and failure ordering monotonic. The expected PTX emits fence.release.sys up
+; front and retries atom.relaxed.cas.b32 on the enclosing 4-byte-aligned word
+; until the CAS succeeds or the byte of interest changed. No fence follows the
+; loop. The cmpxchg result (%pairold) is unused and the function returns %new.
+define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB18_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB18_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB18_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB18_1;
+; SM90-NEXT:  $L__BB18_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
+    ret i8 %new
+}
+
+; i8 cmpxchg on a global-memory pointer (addrspace(1)) with success ordering
+; release and failure ordering monotonic. The expected PTX emits
+; fence.release.sys up front and retries atom.relaxed.global.cas.b32 on the
+; enclosing 4-byte-aligned word until the CAS succeeds or the byte of interest
+; changed. No fence follows the loop. The cmpxchg result (%pairold) is unused
+; and the function returns %new.
+define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_global_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB19_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB19_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB19_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB19_1;
+; SM90-NEXT:  $L__BB19_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
+    ret i8 %new
+}
+
+; i8 cmpxchg on a shared-memory pointer (addrspace(3)) with success ordering
+; release and failure ordering monotonic. The expected PTX emits
+; fence.release.sys up front and retries atom.relaxed.shared.cas.b32 on the
+; enclosing 4-byte-aligned word until the CAS succeeds or the byte of interest
+; changed. No fence follows the loop. The cmpxchg result (%pairold) is unused
+; and the function returns %new.
+define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_monotonic_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB20_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB20_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB20_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB20_1;
+; SM90-NEXT:  $L__BB20_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
+    ret i8 %new
+}
+
+; i8 cmpxchg on a generic pointer (plain ptr) with success ordering release
+; and failure ordering acquire. The expected PTX emits fence.release.sys up
+; front, retries atom.relaxed.cas.b32 on the enclosing 4-byte-aligned word
+; until the CAS succeeds or the byte of interest changed, and ends with
+; fence.acquire.sys. The cmpxchg result (%pairold) is unused and the function
+; returns %new.
+define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_generic_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB21_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB21_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB21_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB21_1;
+; SM90-NEXT:  $L__BB21_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
+    ret i8 %new
+}
+
+; i8 cmpxchg on a global-memory pointer (addrspace(1)) with success ordering
+; release and failure ordering acquire. The expected PTX emits
+; fence.release.sys up front, retries atom.relaxed.global.cas.b32 on the
+; enclosing 4-byte-aligned word until the CAS succeeds or the byte of interest
+; changed, and ends with fence.acquire.sys. The cmpxchg result (%pairold) is
+; unused and the function returns %new.
+define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_global_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB22_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB22_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB22_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB22_1;
+; SM90-NEXT:  $L__BB22_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
+    ret i8 %new
+}
+
+; i8 cmpxchg on a shared-memory pointer (addrspace(3)) with success ordering
+; release and failure ordering acquire. The expected PTX emits
+; fence.release.sys up front, retries atom.relaxed.shared.cas.b32 on the
+; enclosing 4-byte-aligned word until the CAS succeeds or the byte of interest
+; changed, and ends with fence.acquire.sys. The cmpxchg result (%pairold) is
+; unused and the function returns %new.
+define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_acquire_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i8_shared_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_acquire_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB23_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB23_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB23_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB23_1;
+; SM90-NEXT:  $L__BB23_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
+    ret i8 %new
+}
+
+; i8 cmpxchg on a generic pointer (plain ptr) with success ordering release
+; and failure ordering seq_cst. The expected PTX emits fence.sc.sys up front,
+; retries atom.relaxed.cas.b32 on the enclosing 4-byte-aligned word until the
+; CAS succeeds or the byte of interest changed, and ends with
+; fence.acquire.sys. The cmpxchg result (%pairold) is unused and the function
+; returns %new.
+define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB24_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB24_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB24_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB24_1;
+; SM90-NEXT:  $L__BB24_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
+    ret i8 %new
+}
+
+; i8 cmpxchg on a global-memory pointer (addrspace(1)) with success ordering
+; release and failure ordering seq_cst. The expected PTX emits fence.sc.sys
+; up front, retries atom.relaxed.global.cas.b32 on the enclosing 4-byte-aligned
+; word until the CAS succeeds or the byte of interest changed, and ends with
+; fence.acquire.sys. The cmpxchg result (%pairold) is unused and the function
+; returns %new.
+define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB25_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB25_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB25_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB25_1;
+; SM90-NEXT:  $L__BB25_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
+    ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg in the shared address space: success=release, failure=seq_cst
+; SM90-LABEL: release_seq_cst_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB26_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB26_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB26_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB26_1;
+; SM90-NEXT:  $L__BB26_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst ; expected PTX: fence.sc.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg through a generic pointer: success=acq_rel, failure=monotonic
+; SM90-LABEL: acq_rel_monotonic_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB27_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB27_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB27_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB27_1;
+; SM90-NEXT:  $L__BB27_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic ; expected PTX: fence.release.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg in the global address space: success=acq_rel, failure=monotonic
+; SM90-LABEL: acq_rel_monotonic_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB28_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB28_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB28_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB28_1;
+; SM90-NEXT:  $L__BB28_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic ; expected PTX: fence.release.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg in the shared address space: success=acq_rel, failure=monotonic
+; SM90-LABEL: acq_rel_monotonic_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB29_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB29_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB29_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB29_1;
+; SM90-NEXT:  $L__BB29_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic ; expected PTX: fence.release.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg through a generic pointer: success=acq_rel, failure=acquire
+; SM90-LABEL: acq_rel_acquire_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB30_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB30_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB30_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB30_1;
+; SM90-NEXT:  $L__BB30_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire ; expected PTX: fence.release.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg in the global address space: success=acq_rel, failure=acquire
+; SM90-LABEL: acq_rel_acquire_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB31_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB31_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB31_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB31_1;
+; SM90-NEXT:  $L__BB31_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire ; expected PTX: fence.release.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg in the shared address space: success=acq_rel, failure=acquire
+; SM90-LABEL: acq_rel_acquire_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB32_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB32_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB32_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB32_1;
+; SM90-NEXT:  $L__BB32_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire ; expected PTX: fence.release.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg through a generic pointer: success=acq_rel, failure=seq_cst
+; SM90-LABEL: acq_rel_seq_cst_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB33_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB33_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB33_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB33_1;
+; SM90-NEXT:  $L__BB33_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst ; expected PTX: fence.sc.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg in the global address space: success=acq_rel, failure=seq_cst
+; SM90-LABEL: acq_rel_seq_cst_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB34_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB34_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB34_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB34_1;
+; SM90-NEXT:  $L__BB34_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst ; expected PTX: fence.sc.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg in the shared address space: success=acq_rel, failure=seq_cst
+; SM90-LABEL: acq_rel_seq_cst_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB35_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB35_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB35_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB35_1;
+; SM90-NEXT:  $L__BB35_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst ; expected PTX: fence.sc.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg through a generic pointer: success=seq_cst, failure=monotonic
+; SM90-LABEL: seq_cst_monotonic_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB36_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB36_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB36_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB36_1;
+; SM90-NEXT:  $L__BB36_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic ; expected PTX: fence.sc.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg in the global address space: success=seq_cst, failure=monotonic
+; SM90-LABEL: seq_cst_monotonic_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB37_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB37_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB37_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB37_1;
+; SM90-NEXT:  $L__BB37_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic ; expected PTX: fence.sc.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg in the shared address space: success=seq_cst, failure=monotonic
+; SM90-LABEL: seq_cst_monotonic_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB38_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB38_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB38_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB38_1;
+; SM90-NEXT:  $L__BB38_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic ; expected PTX: fence.sc.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg through a generic pointer: success=seq_cst, failure=acquire
+; SM90-LABEL: seq_cst_acquire_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB39_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB39_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB39_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB39_1;
+; SM90-NEXT:  $L__BB39_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire ; expected PTX: fence.sc.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg in the global address space: success=seq_cst, failure=acquire
+; SM90-LABEL: seq_cst_acquire_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB40_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB40_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB40_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB40_1;
+; SM90-NEXT:  $L__BB40_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire ; expected PTX: fence.sc.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; i8 cmpxchg in the shared address space: success=seq_cst, failure=acquire
+; SM90-LABEL: seq_cst_acquire_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB41_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB41_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB41_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB41_1;
+; SM90-NEXT:  $L__BB41_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire ; expected PTX: fence.sc.sys, relaxed b32 CAS loop on the 4-byte-aligned word, fence.acquire.sys
+    ret i8 %new ; returns the %new argument; %pairold is not used
+}
+
+define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB42_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB42_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB42_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB42_1;
+; SM90-NEXT:  $L__BB42_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.global.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB43_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB43_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB43_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB43_1;
+; SM90-NEXT:  $L__BB43_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB44_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB44_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB44_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB44_1;
+; SM90-NEXT:  $L__BB44_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+    ret i8 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB45_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB45_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB45_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB45_1;
+; SM90-NEXT:  $L__BB45_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB46_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB46_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB46_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB46_1;
+; SM90-NEXT:  $L__BB46_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB47_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB47_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB47_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB47_1;
+; SM90-NEXT:  $L__BB47_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB48_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB48_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB48_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB48_1;
+; SM90-NEXT:  $L__BB48_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB49_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB49_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB49_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB49_1;
+; SM90-NEXT:  $L__BB49_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB50_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB50_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB50_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB50_1;
+; SM90-NEXT:  $L__BB50_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB51_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB51_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB51_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB51_1;
+; SM90-NEXT:  $L__BB51_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB52_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB52_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB52_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB52_1;
+; SM90-NEXT:  $L__BB52_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB53_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB53_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB53_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB53_1;
+; SM90-NEXT:  $L__BB53_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB54_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB54_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB54_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB54_1;
+; SM90-NEXT:  $L__BB54_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB55_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB55_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB55_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB55_1;
+; SM90-NEXT:  $L__BB55_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB56_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB56_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB56_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB56_1;
+; SM90-NEXT:  $L__BB56_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB57_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB57_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB57_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB57_1;
+; SM90-NEXT:  $L__BB57_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_global_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB58_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB58_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB58_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB58_1;
+; SM90-NEXT:  $L__BB58_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB59_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB59_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB59_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB59_1;
+; SM90-NEXT:  $L__BB59_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB60_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB60_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB60_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB60_1;
+; SM90-NEXT:  $L__BB60_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB61_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB61_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB61_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB61_1;
+; SM90-NEXT:  $L__BB61_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB62_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB62_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB62_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB62_1;
+; SM90-NEXT:  $L__BB62_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_generic_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB63_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB63_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB63_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB63_1;
+; SM90-NEXT:  $L__BB63_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_global_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_global_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB64_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB64_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB64_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB64_1;
+; SM90-NEXT:  $L__BB64_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic
+    ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_monotonic_i16_shared_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB65_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB65_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB65_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB65_1;
+; SM90-NEXT:  $L__BB65_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_generic_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_generic_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB66_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB66_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB66_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB66_1;
+; SM90-NEXT:  $L__BB66_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_global_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_global_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB67_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB67_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB67_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB67_1;
+; SM90-NEXT:  $L__BB67_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
+    ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_acquire_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i16_shared_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_acquire_i16_shared_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB68_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB68_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB68_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB68_1;
+; SM90-NEXT:  $L__BB68_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB69_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB69_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB69_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB69_1;
+; SM90-NEXT:  $L__BB69_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_global_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB70_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB70_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB70_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB70_1;
+; SM90-NEXT:  $L__BB70_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
+    ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB71_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB71_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB71_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB71_1;
+; SM90-NEXT:  $L__BB71_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB72_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB72_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB72_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB72_1;
+; SM90-NEXT:  $L__BB72_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB73_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB73_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB73_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB73_1;
+; SM90-NEXT:  $L__BB73_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB74_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB74_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB74_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB74_1;
+; SM90-NEXT:  $L__BB74_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB75_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB75_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB75_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB75_1;
+; SM90-NEXT:  $L__BB75_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB76_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB76_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB76_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB76_1;
+; SM90-NEXT:  $L__BB76_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB77_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB77_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB77_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB77_1;
+; SM90-NEXT:  $L__BB77_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB78_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB78_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB78_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB78_1;
+; SM90-NEXT:  $L__BB78_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB79_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB79_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB79_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB79_1;
+; SM90-NEXT:  $L__BB79_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB80_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB80_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB80_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB80_1;
+; SM90-NEXT:  $L__BB80_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB81_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB81_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB81_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB81_1;
+; SM90-NEXT:  $L__BB81_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB82_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB82_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB82_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB82_1;
+; SM90-NEXT:  $L__BB82_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB83_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB83_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB83_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB83_1;
+; SM90-NEXT:  $L__BB83_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB84_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB84_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB84_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB84_1;
+; SM90-NEXT:  $L__BB84_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB85_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB85_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB85_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB85_1;
+; SM90-NEXT:  $L__BB85_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB86_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB86_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB86_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB86_1;
+; SM90-NEXT:  $L__BB86_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB87_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB87_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB87_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB87_1;
+; SM90-NEXT:  $L__BB87_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_global(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.global.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB88_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB88_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB88_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB88_1;
+; SM90-NEXT:  $L__BB88_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
+    ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB89_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB89_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB89_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB89_1;
+; SM90-NEXT:  $L__BB89_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
+    ret i16 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2];
+; SM90-NEXT:    atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2];
+; SM90-NEXT:    atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2];
+; SM90-NEXT:    atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_generic_param_2];
+; SM90-NEXT:    atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_global_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_global_param_2];
+; SM90-NEXT:    atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
+    ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_monotonic_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_monotonic_i32_shared_param_2];
+; SM90-NEXT:    atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_generic_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_generic_param_2];
+; SM90-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_global_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_global_param_2];
+; SM90-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
+    ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i32_shared_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_acquire_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_acquire_i32_shared_param_2];
+; SM90-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
+    ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2];
+; SM90-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2];
+; SM90-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2];
+; SM90-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2];
+; SM90-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2];
+; SM90-NEXT:    atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2];
+; SM90-NEXT:    atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
+    ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
+    ret i32 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2];
+; SM90-NEXT:    atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2];
+; SM90-NEXT:    atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2];
+; SM90-NEXT:    atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2];
+; SM90-NEXT:    atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_global_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_global_param_2];
+; SM90-NEXT:    atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
+    ret i64 %new
+}
+
+define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2];
+; SM90-NEXT:    atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_generic_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_generic_param_2];
+; SM90-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_global_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_global_param_2];
+; SM90-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
+    ret i64 %new
+}
+
+define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_acquire_i64_shared_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_acquire_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_acquire_i64_shared_param_2];
+; SM90-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
+    ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
+    ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
+    ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
+    ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
+; SM90-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
+    ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2];
+; SM90-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
+    ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
+; SM90-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
+    ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2];
+; SM90-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+    ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2];
+; SM90-NEXT:    atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
+    ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2];
+; SM90-NEXT:    atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
+    ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+    ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
+    ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
+    ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic
+    ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire
+    ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire
+    ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire
+    ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_generic(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
+; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_global(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
+; SM90-NEXT:    atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_shared(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
+; SM90-NEXT:    atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+    %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+    ret i64 %new
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
index 33a1f15c6a5cd69..aaea0d2ee25efb3 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
@@ -7,6 +7,7 @@
 ; TODO: these are system scope, but are compiled to gpu scope..
 ; TODO: these are seq_cst, but are compiled to relaxed..
 
+
 ; CHECK-LABEL: relaxed_sys_i8
 define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-LABEL: relaxed_sys_i8(
@@ -17,86 +18,1153 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
-; SM30-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM30-NEXT:    ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
+; SM30-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM30-NEXT:    and.b32 %r10, %r9, 3;
+; SM30-NEXT:    shl.b32 %r1, %r10, 3;
+; SM30-NEXT:    mov.b32 %r11, 255;
+; SM30-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM30-NEXT:    not.b32 %r2, %r12;
+; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM30-NEXT:    and.b32 %r14, %r13, 255;
+; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM30-NEXT:    ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM30-NEXT:    ld.u32 %r16, [%rd1];
+; SM30-NEXT:    and.b32 %r20, %r16, %r2;
+; SM30-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
+; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM30-NEXT:    or.b32 %r17, %r20, %r3;
+; SM30-NEXT:    or.b32 %r18, %r20, %r4;
+; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM30-NEXT:    @%p1 bra $L__BB0_3;
+; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; SM30-NEXT:    and.b32 %r8, %r7, %r2;
+; SM30-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM30-NEXT:    mov.u32 %r20, %r8;
+; SM30-NEXT:    @%p2 bra $L__BB0_1;
+; SM30-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM30-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: relaxed_sys_i8(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB0_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB0_1;
+; SM70-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+; SM90-LABEL: relaxed_sys_i8(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB0_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB0_1;
+; SM90-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+  ret i8 %new
+}
+
+define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
+; SM30-LABEL: acquire_sys_i8(
+; SM30:       {
+; SM30-NEXT:    .reg .pred %p<3>;
+; SM30-NEXT:    .reg .b16 %rs<2>;
+; SM30-NEXT:    .reg .b32 %r<21>;
+; SM30-NEXT:    .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u8 %rs1, [acquire_sys_i8_param_2];
+; SM30-NEXT:    ld.param.u64 %rd2, [acquire_sys_i8_param_0];
+; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM30-NEXT:    and.b32 %r10, %r9, 3;
+; SM30-NEXT:    shl.b32 %r1, %r10, 3;
+; SM30-NEXT:    mov.b32 %r11, 255;
+; SM30-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM30-NEXT:    not.b32 %r2, %r12;
+; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM30-NEXT:    and.b32 %r14, %r13, 255;
+; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM30-NEXT:    ld.param.u8 %r15, [acquire_sys_i8_param_1];
+; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM30-NEXT:    ld.u32 %r16, [%rd1];
+; SM30-NEXT:    and.b32 %r20, %r16, %r2;
+; SM30-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
+; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM30-NEXT:    or.b32 %r17, %r20, %r3;
+; SM30-NEXT:    or.b32 %r18, %r20, %r4;
+; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM30-NEXT:    @%p1 bra $L__BB1_3;
+; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; SM30-NEXT:    and.b32 %r8, %r7, %r2;
+; SM30-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM30-NEXT:    mov.u32 %r20, %r8;
+; SM30-NEXT:    @%p2 bra $L__BB1_1;
+; SM30-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: acquire_sys_i8(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acquire_sys_i8_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_sys_i8_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acquire_sys_i8_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB1_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB1_1;
+; SM70-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+; SM90-LABEL: acquire_sys_i8(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acquire_sys_i8_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_sys_i8_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acquire_sys_i8_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB1_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB1_1;
+; SM90-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
+  ret i8 %new
+}
+
+define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
+; SM30-LABEL: release_sys_i8(
+; SM30:       {
+; SM30-NEXT:    .reg .pred %p<3>;
+; SM30-NEXT:    .reg .b16 %rs<2>;
+; SM30-NEXT:    .reg .b32 %r<21>;
+; SM30-NEXT:    .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u8 %rs1, [release_sys_i8_param_2];
+; SM30-NEXT:    ld.param.u64 %rd2, [release_sys_i8_param_0];
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM30-NEXT:    and.b32 %r10, %r9, 3;
+; SM30-NEXT:    shl.b32 %r1, %r10, 3;
+; SM30-NEXT:    mov.b32 %r11, 255;
+; SM30-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM30-NEXT:    not.b32 %r2, %r12;
+; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM30-NEXT:    and.b32 %r14, %r13, 255;
+; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM30-NEXT:    ld.param.u8 %r15, [release_sys_i8_param_1];
+; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM30-NEXT:    ld.u32 %r16, [%rd1];
+; SM30-NEXT:    and.b32 %r20, %r16, %r2;
+; SM30-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
+; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM30-NEXT:    or.b32 %r17, %r20, %r3;
+; SM30-NEXT:    or.b32 %r18, %r20, %r4;
+; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM30-NEXT:    @%p1 bra $L__BB2_3;
+; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT:    // in Loop: Header=BB2_1 Depth=1
+; SM30-NEXT:    and.b32 %r8, %r7, %r2;
+; SM30-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM30-NEXT:    mov.u32 %r20, %r8;
+; SM30-NEXT:    @%p2 bra $L__BB2_1;
+; SM30-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
+; SM30-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: release_sys_i8(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [release_sys_i8_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_sys_i8_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [release_sys_i8_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB2_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB2_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB2_1;
+; SM70-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+; SM90-LABEL: release_sys_i8(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [release_sys_i8_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_sys_i8_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [release_sys_i8_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB2_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB2_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB2_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB2_1;
+; SM90-NEXT:  $L__BB2_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
+  ret i8 %new
+}
+
+define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
+; SM30-LABEL: acq_rel_sys_i8(
+; SM30:       {
+; SM30-NEXT:    .reg .pred %p<3>;
+; SM30-NEXT:    .reg .b16 %rs<2>;
+; SM30-NEXT:    .reg .b32 %r<21>;
+; SM30-NEXT:    .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u8 %rs1, [acq_rel_sys_i8_param_2];
+; SM30-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i8_param_0];
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM30-NEXT:    and.b32 %r10, %r9, 3;
+; SM30-NEXT:    shl.b32 %r1, %r10, 3;
+; SM30-NEXT:    mov.b32 %r11, 255;
+; SM30-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM30-NEXT:    not.b32 %r2, %r12;
+; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM30-NEXT:    and.b32 %r14, %r13, 255;
+; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM30-NEXT:    ld.param.u8 %r15, [acq_rel_sys_i8_param_1];
+; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM30-NEXT:    ld.u32 %r16, [%rd1];
+; SM30-NEXT:    and.b32 %r20, %r16, %r2;
+; SM30-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
+; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM30-NEXT:    or.b32 %r17, %r20, %r3;
+; SM30-NEXT:    or.b32 %r18, %r20, %r4;
+; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM30-NEXT:    @%p1 bra $L__BB3_3;
+; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; SM30-NEXT:    and.b32 %r8, %r7, %r2;
+; SM30-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM30-NEXT:    mov.u32 %r20, %r8;
+; SM30-NEXT:    @%p2 bra $L__BB3_1;
+; SM30-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: acq_rel_sys_i8(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [acq_rel_sys_i8_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i8_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [acq_rel_sys_i8_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB3_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB3_1;
+; SM70-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+; SM90-LABEL: acq_rel_sys_i8(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [acq_rel_sys_i8_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i8_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [acq_rel_sys_i8_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB3_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB3_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB3_1;
+; SM90-NEXT:  $L__BB3_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
+  ret i8 %new
+}
+
+define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
+; SM30-LABEL: seq_cst_sys_i8(
+; SM30:       {
+; SM30-NEXT:    .reg .pred %p<3>;
+; SM30-NEXT:    .reg .b16 %rs<2>;
+; SM30-NEXT:    .reg .b32 %r<21>;
+; SM30-NEXT:    .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u8 %rs1, [seq_cst_sys_i8_param_2];
+; SM30-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i8_param_0];
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM30-NEXT:    and.b32 %r10, %r9, 3;
+; SM30-NEXT:    shl.b32 %r1, %r10, 3;
+; SM30-NEXT:    mov.b32 %r11, 255;
+; SM30-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM30-NEXT:    not.b32 %r2, %r12;
+; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM30-NEXT:    and.b32 %r14, %r13, 255;
+; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM30-NEXT:    ld.param.u8 %r15, [seq_cst_sys_i8_param_1];
+; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM30-NEXT:    ld.u32 %r16, [%rd1];
+; SM30-NEXT:    and.b32 %r20, %r16, %r2;
+; SM30-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
+; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM30-NEXT:    or.b32 %r17, %r20, %r3;
+; SM30-NEXT:    or.b32 %r18, %r20, %r4;
+; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM30-NEXT:    @%p1 bra $L__BB4_3;
+; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT:    // in Loop: Header=BB4_1 Depth=1
+; SM30-NEXT:    and.b32 %r8, %r7, %r2;
+; SM30-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM30-NEXT:    mov.u32 %r20, %r8;
+; SM30-NEXT:    @%p2 bra $L__BB4_1;
+; SM30-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: seq_cst_sys_i8(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u8 %rs1, [seq_cst_sys_i8_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i8_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [seq_cst_sys_i8_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
+; SM70-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT:    @%p1 bra $L__BB4_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB4_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB4_1;
+; SM70-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+; SM90-LABEL: seq_cst_sys_i8(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<21>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u8 %rs1, [seq_cst_sys_i8_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i8_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT:    and.b32 %r10, %r9, 3;
+; SM90-NEXT:    shl.b32 %r1, %r10, 3;
+; SM90-NEXT:    mov.b32 %r11, 255;
+; SM90-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM90-NEXT:    not.b32 %r2, %r12;
+; SM90-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT:    and.b32 %r14, %r13, 255;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    ld.param.u8 %r15, [seq_cst_sys_i8_param_1];
+; SM90-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM90-NEXT:    ld.u32 %r16, [%rd1];
+; SM90-NEXT:    and.b32 %r20, %r16, %r2;
+; SM90-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r17, %r20, %r3;
+; SM90-NEXT:    or.b32 %r18, %r20, %r4;
+; SM90-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT:    @%p1 bra $L__BB4_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB4_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT:    mov.u32 %r20, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB4_1;
+; SM90-NEXT:  $L__BB4_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+  ret i8 %new
+}
+
+; CHECK-LABEL: relaxed_sys_i16
+define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
+; SM30-LABEL: relaxed_sys_i16(
+; SM30:       {
+; SM30-NEXT:    .reg .pred %p<3>;
+; SM30-NEXT:    .reg .b16 %rs<2>;
+; SM30-NEXT:    .reg .b32 %r<20>;
+; SM30-NEXT:    .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
+; SM30-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM30-NEXT:    ld.param.u16 %r9, [relaxed_sys_i16_param_1];
+; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM30-NEXT:    and.b32 %r11, %r10, 3;
+; SM30-NEXT:    shl.b32 %r1, %r11, 3;
+; SM30-NEXT:    mov.b32 %r12, 65535;
+; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM30-NEXT:    not.b32 %r2, %r13;
+; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM30-NEXT:    ld.u32 %r15, [%rd1];
+; SM30-NEXT:    and.b32 %r19, %r15, %r2;
+; SM30-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
+; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM30-NEXT:    or.b32 %r16, %r19, %r3;
+; SM30-NEXT:    or.b32 %r17, %r19, %r4;
+; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM30-NEXT:    @%p1 bra $L__BB5_3;
+; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT:    // in Loop: Header=BB5_1 Depth=1
+; SM30-NEXT:    and.b32 %r8, %r7, %r2;
+; SM30-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM30-NEXT:    mov.u32 %r19, %r8;
+; SM30-NEXT:    @%p2 bra $L__BB5_1;
+; SM30-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
+; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: relaxed_sys_i16(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [relaxed_sys_i16_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB5_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB5_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB5_1;
+; SM70-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+; SM90-LABEL: relaxed_sys_i16(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [relaxed_sys_i16_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB5_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB5_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB5_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB5_1;
+; SM90-NEXT:  $L__BB5_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
+  ret i16 %new
+}
+
+define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
+; SM30-LABEL: acquire_sys_i16(
+; SM30:       {
+; SM30-NEXT:    .reg .pred %p<3>;
+; SM30-NEXT:    .reg .b16 %rs<2>;
+; SM30-NEXT:    .reg .b32 %r<20>;
+; SM30-NEXT:    .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u16 %rs1, [acquire_sys_i16_param_2];
+; SM30-NEXT:    ld.param.u64 %rd2, [acquire_sys_i16_param_0];
+; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM30-NEXT:    ld.param.u16 %r9, [acquire_sys_i16_param_1];
+; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM30-NEXT:    and.b32 %r11, %r10, 3;
+; SM30-NEXT:    shl.b32 %r1, %r11, 3;
+; SM30-NEXT:    mov.b32 %r12, 65535;
+; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM30-NEXT:    not.b32 %r2, %r13;
+; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM30-NEXT:    ld.u32 %r15, [%rd1];
+; SM30-NEXT:    and.b32 %r19, %r15, %r2;
+; SM30-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
+; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM30-NEXT:    or.b32 %r16, %r19, %r3;
+; SM30-NEXT:    or.b32 %r17, %r19, %r4;
+; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM30-NEXT:    @%p1 bra $L__BB6_3;
+; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT:    // in Loop: Header=BB6_1 Depth=1
+; SM30-NEXT:    and.b32 %r8, %r7, %r2;
+; SM30-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM30-NEXT:    mov.u32 %r19, %r8;
+; SM30-NEXT:    @%p2 bra $L__BB6_1;
+; SM30-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: acquire_sys_i16(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [acquire_sys_i16_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_sys_i16_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [acquire_sys_i16_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB6_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB6_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB6_1;
+; SM70-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+; SM90-LABEL: acquire_sys_i16(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acquire_sys_i16_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_sys_i16_param_0];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    ld.param.u16 %r9, [acquire_sys_i16_param_1];
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB6_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB6_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB6_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB6_1;
+; SM90-NEXT:  $L__BB6_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire
+  ret i16 %new
+}
+
+define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
+; SM30-LABEL: release_sys_i16(
+; SM30:       {
+; SM30-NEXT:    .reg .pred %p<3>;
+; SM30-NEXT:    .reg .b16 %rs<2>;
+; SM30-NEXT:    .reg .b32 %r<20>;
+; SM30-NEXT:    .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u16 %rs1, [release_sys_i16_param_2];
+; SM30-NEXT:    ld.param.u64 %rd2, [release_sys_i16_param_0];
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    ld.param.u16 %r9, [release_sys_i16_param_1];
+; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM30-NEXT:    and.b32 %r11, %r10, 3;
+; SM30-NEXT:    shl.b32 %r1, %r11, 3;
+; SM30-NEXT:    mov.b32 %r12, 65535;
+; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM30-NEXT:    not.b32 %r2, %r13;
+; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM30-NEXT:    ld.u32 %r15, [%rd1];
+; SM30-NEXT:    and.b32 %r19, %r15, %r2;
+; SM30-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
+; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM30-NEXT:    or.b32 %r16, %r19, %r3;
+; SM30-NEXT:    or.b32 %r17, %r19, %r4;
+; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM30-NEXT:    @%p1 bra $L__BB7_3;
+; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT:    // in Loop: Header=BB7_1 Depth=1
+; SM30-NEXT:    and.b32 %r8, %r7, %r2;
+; SM30-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM30-NEXT:    mov.u32 %r19, %r8;
+; SM30-NEXT:    @%p2 bra $L__BB7_1;
+; SM30-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
+; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: release_sys_i16(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u16 %rs1, [release_sys_i16_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_sys_i16_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [release_sys_i16_param_1];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB7_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB7_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB7_1;
+; SM70-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+; SM90-LABEL: release_sys_i16(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [release_sys_i16_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_sys_i16_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [release_sys_i16_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB7_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB7_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB7_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB7_1;
+; SM90-NEXT:  $L__BB7_3: // %partword.cmpxchg.end
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
+  ret i16 %new
+}
+
+define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
+; SM30-LABEL: acq_rel_sys_i16(
+; SM30:       {
+; SM30-NEXT:    .reg .pred %p<3>;
+; SM30-NEXT:    .reg .b16 %rs<2>;
+; SM30-NEXT:    .reg .b32 %r<20>;
+; SM30-NEXT:    .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u16 %rs1, [acq_rel_sys_i16_param_2];
+; SM30-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i16_param_0];
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    ld.param.u16 %r9, [acq_rel_sys_i16_param_1];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM30-NEXT:    and.b32 %r10, %r9, 3;
-; SM30-NEXT:    shl.b32 %r1, %r10, 3;
-; SM30-NEXT:    mov.b32 %r11, 255;
-; SM30-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM30-NEXT:    not.b32 %r2, %r12;
-; SM30-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM30-NEXT:    and.b32 %r14, %r13, 255;
+; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM30-NEXT:    and.b32 %r11, %r10, 3;
+; SM30-NEXT:    shl.b32 %r1, %r11, 3;
+; SM30-NEXT:    mov.b32 %r12, 65535;
+; SM30-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM30-NEXT:    not.b32 %r2, %r13;
+; SM30-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM30-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM30-NEXT:    ld.param.u8 %r15, [relaxed_sys_i8_param_1];
-; SM30-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM30-NEXT:    ld.u32 %r16, [%rd1];
-; SM30-NEXT:    and.b32 %r20, %r16, %r2;
-; SM30-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
+; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM30-NEXT:    ld.u32 %r15, [%rd1];
+; SM30-NEXT:    and.b32 %r19, %r15, %r2;
+; SM30-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM30-NEXT:    or.b32 %r17, %r20, %r3;
-; SM30-NEXT:    or.b32 %r18, %r20, %r4;
-; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r18;
-; SM30-NEXT:    @%p1 bra $L__BB0_3;
+; SM30-NEXT:    or.b32 %r16, %r19, %r3;
+; SM30-NEXT:    or.b32 %r17, %r19, %r4;
+; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM30-NEXT:    @%p1 bra $L__BB8_3;
 ; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM30-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; SM30-NEXT:    // in Loop: Header=BB8_1 Depth=1
 ; SM30-NEXT:    and.b32 %r8, %r7, %r2;
-; SM30-NEXT:    setp.ne.s32 %p2, %r20, %r8;
-; SM30-NEXT:    mov.u32 %r20, %r8;
-; SM30-NEXT:    @%p2 bra $L__BB0_1;
-; SM30-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
-; SM30-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM30-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM30-NEXT:    mov.u32 %r19, %r8;
+; SM30-NEXT:    @%p2 bra $L__BB8_1;
+; SM30-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM30-NEXT:    ret;
 ;
-; SM70-LABEL: relaxed_sys_i8(
+; SM70-LABEL: acq_rel_sys_i16(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
-; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b32 %r<20>;
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM70-NEXT:    ld.param.u16 %rs1, [acq_rel_sys_i16_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i16_param_0];
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [acq_rel_sys_i16_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT:    and.b32 %r10, %r9, 3;
-; SM70-NEXT:    shl.b32 %r1, %r10, 3;
-; SM70-NEXT:    mov.b32 %r11, 255;
-; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
-; SM70-NEXT:    not.b32 %r2, %r12;
-; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
 ; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
-; SM70-NEXT:    ld.param.u8 %r15, [relaxed_sys_i8_param_1];
-; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
-; SM70-NEXT:    ld.u32 %r16, [%rd1];
-; SM70-NEXT:    and.b32 %r20, %r16, %r2;
-; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b32 %r17, %r20, %r3;
-; SM70-NEXT:    or.b32 %r18, %r20, %r4;
-; SM70-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
-; SM70-NEXT:    @%p1 bra $L__BB0_3;
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB8_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB0_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB8_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
-; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
-; SM70-NEXT:    mov.u32 %r20, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB0_1;
-; SM70-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
-; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB8_1;
+; SM70-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
-  ret i8 %new
+; SM90-LABEL: acq_rel_sys_i16(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [acq_rel_sys_i16_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i16_param_0];
+; SM90-NEXT:    fence.release.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [acq_rel_sys_i16_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB8_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB8_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB8_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB8_1;
+; SM90-NEXT:  $L__BB8_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
+  ret i16 %new
 }
 
-; CHECK-LABEL: relaxed_sys_i16
-define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
-; SM30-LABEL: relaxed_sys_i16(
+; CHECK-LABEL: seq_cst_sys_i16
+define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
+; SM30-LABEL: seq_cst_sys_i16(
 ; SM30:       {
 ; SM30-NEXT:    .reg .pred %p<3>;
 ; SM30-NEXT:    .reg .b16 %rs<2>;
@@ -104,10 +1172,11 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30-NEXT:    .reg .b64 %rd<3>;
 ; SM30-EMPTY:
 ; SM30-NEXT:  // %bb.0:
-; SM30-NEXT:    ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
-; SM30-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM30-NEXT:    ld.param.u16 %rs1, [seq_cst_sys_i16_param_2];
+; SM30-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i16_param_0];
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    ld.param.u16 %r9, [seq_cst_sys_i16_param_1];
 ; SM30-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM30-NEXT:    ld.param.u16 %r9, [relaxed_sys_i16_param_1];
 ; SM30-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM30-NEXT:    and.b32 %r11, %r10, 3;
 ; SM30-NEXT:    shl.b32 %r1, %r11, 3;
@@ -119,24 +1188,25 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM30-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM30-NEXT:    ld.u32 %r15, [%rd1];
 ; SM30-NEXT:    and.b32 %r19, %r15, %r2;
-; SM30-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
+; SM30-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM30-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM30-NEXT:    or.b32 %r16, %r19, %r3;
 ; SM30-NEXT:    or.b32 %r17, %r19, %r4;
 ; SM30-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
 ; SM30-NEXT:    setp.eq.s32 %p1, %r7, %r17;
-; SM30-NEXT:    @%p1 bra $L__BB1_3;
+; SM30-NEXT:    @%p1 bra $L__BB9_3;
 ; SM30-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM30-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; SM30-NEXT:    // in Loop: Header=BB9_1 Depth=1
 ; SM30-NEXT:    and.b32 %r8, %r7, %r2;
 ; SM30-NEXT:    setp.ne.s32 %p2, %r19, %r8;
 ; SM30-NEXT:    mov.u32 %r19, %r8;
-; SM30-NEXT:    @%p2 bra $L__BB1_1;
-; SM30-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
+; SM30-NEXT:    @%p2 bra $L__BB9_1;
+; SM30-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
+; SM30-NEXT:    membar.sys;
 ; SM30-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM30-NEXT:    ret;
 ;
-; SM70-LABEL: relaxed_sys_i16(
+; SM70-LABEL: seq_cst_sys_i16(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
 ; SM70-NEXT:    .reg .b16 %rs<2>;
@@ -144,10 +1214,11 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
-; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM70-NEXT:    ld.param.u16 %rs1, [seq_cst_sys_i16_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i16_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u16 %r9, [seq_cst_sys_i16_param_1];
 ; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
-; SM70-NEXT:    ld.param.u16 %r9, [relaxed_sys_i16_param_1];
 ; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
 ; SM70-NEXT:    and.b32 %r11, %r10, 3;
 ; SM70-NEXT:    shl.b32 %r1, %r11, 3;
@@ -159,23 +1230,65 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
 ; SM70-NEXT:    ld.u32 %r15, [%rd1];
 ; SM70-NEXT:    and.b32 %r19, %r15, %r2;
-; SM70-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
+; SM70-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
 ; SM70-NEXT:    or.b32 %r16, %r19, %r3;
 ; SM70-NEXT:    or.b32 %r17, %r19, %r4;
-; SM70-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
 ; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
-; SM70-NEXT:    @%p1 bra $L__BB1_3;
+; SM70-NEXT:    @%p1 bra $L__BB9_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; SM70-NEXT:    // in Loop: Header=BB9_1 Depth=1
 ; SM70-NEXT:    and.b32 %r8, %r7, %r2;
 ; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
 ; SM70-NEXT:    mov.u32 %r19, %r8;
-; SM70-NEXT:    @%p2 bra $L__BB1_1;
-; SM70-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
+; SM70-NEXT:    @%p2 bra $L__BB9_1;
+; SM70-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
+; SM70-NEXT:    fence.acq_rel.sys;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
+; SM90-LABEL: seq_cst_sys_i16(
+; SM90:       {
+; SM90-NEXT:    .reg .pred %p<3>;
+; SM90-NEXT:    .reg .b16 %rs<2>;
+; SM90-NEXT:    .reg .b32 %r<20>;
+; SM90-NEXT:    .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u16 %rs1, [seq_cst_sys_i16_param_2];
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i16_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u16 %r9, [seq_cst_sys_i16_param_1];
+; SM90-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM90-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT:    and.b32 %r11, %r10, 3;
+; SM90-NEXT:    shl.b32 %r1, %r11, 3;
+; SM90-NEXT:    mov.b32 %r12, 65535;
+; SM90-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM90-NEXT:    not.b32 %r2, %r13;
+; SM90-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM90-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM90-NEXT:    ld.u32 %r15, [%rd1];
+; SM90-NEXT:    and.b32 %r19, %r15, %r2;
+; SM90-NEXT:  $L__BB9_1: // %partword.cmpxchg.loop
+; SM90-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM90-NEXT:    or.b32 %r16, %r19, %r3;
+; SM90-NEXT:    or.b32 %r17, %r19, %r4;
+; SM90-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT:    @%p1 bra $L__BB9_3;
+; SM90-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT:    // in Loop: Header=BB9_1 Depth=1
+; SM90-NEXT:    and.b32 %r8, %r7, %r2;
+; SM90-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT:    mov.u32 %r19, %r8;
+; SM90-NEXT:    @%p2 bra $L__BB9_1;
+; SM90-NEXT:  $L__BB9_3: // %partword.cmpxchg.end
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
   ret i16 %new
 }
 
@@ -203,13 +1316,197 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i32_param_0];
 ; SM70-NEXT:    ld.param.u32 %r1, [relaxed_sys_i32_param_1];
 ; SM70-NEXT:    ld.param.u32 %r2, [relaxed_sys_i32_param_2];
-; SM70-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
+; SM90-LABEL: relaxed_sys_i32(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i32_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [relaxed_sys_i32_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [relaxed_sys_i32_param_2];
+; SM90-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
   %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
   ret i32 %new
 }
 
+define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
+; SM30-LABEL: acq_rel_sys_i32(
+; SM30:       {
+; SM30-NEXT:    .reg .b32 %r<4>;
+; SM30-NEXT:    .reg .b64 %rd<2>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u64 %rd1, [acq_rel_sys_i32_param_0];
+; SM30-NEXT:    ld.param.u32 %r1, [acq_rel_sys_i32_param_1];
+; SM30-NEXT:    ld.param.u32 %r2, [acq_rel_sys_i32_param_2];
+; SM30-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: acq_rel_sys_i32(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_sys_i32_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [acq_rel_sys_i32_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acq_rel_sys_i32_param_2];
+; SM70-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+; SM90-LABEL: acq_rel_sys_i32(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_sys_i32_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acq_rel_sys_i32_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acq_rel_sys_i32_param_2];
+; SM90-NEXT:    atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
+  ret i32 %new
+}
+
+define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
+; SM30-LABEL: acquire_sys_i32(
+; SM30:       {
+; SM30-NEXT:    .reg .b32 %r<4>;
+; SM30-NEXT:    .reg .b64 %rd<2>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u64 %rd1, [acquire_sys_i32_param_0];
+; SM30-NEXT:    ld.param.u32 %r1, [acquire_sys_i32_param_1];
+; SM30-NEXT:    ld.param.u32 %r2, [acquire_sys_i32_param_2];
+; SM30-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: acquire_sys_i32(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_sys_i32_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [acquire_sys_i32_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [acquire_sys_i32_param_2];
+; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+; SM90-LABEL: acquire_sys_i32(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_sys_i32_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [acquire_sys_i32_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [acquire_sys_i32_param_2];
+; SM90-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
+  ret i32 %new
+}
+
+define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
+; SM30-LABEL: release_sys_i32(
+; SM30:       {
+; SM30-NEXT:    .reg .b32 %r<4>;
+; SM30-NEXT:    .reg .b64 %rd<2>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u64 %rd1, [release_sys_i32_param_0];
+; SM30-NEXT:    ld.param.u32 %r1, [release_sys_i32_param_1];
+; SM30-NEXT:    ld.param.u32 %r2, [release_sys_i32_param_2];
+; SM30-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: release_sys_i32(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_sys_i32_param_0];
+; SM70-NEXT:    ld.param.u32 %r1, [release_sys_i32_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [release_sys_i32_param_2];
+; SM70-NEXT:    atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+; SM90-LABEL: release_sys_i32(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_sys_i32_param_0];
+; SM90-NEXT:    ld.param.u32 %r1, [release_sys_i32_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [release_sys_i32_param_2];
+; SM90-NEXT:    atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
+  ret i32 %new
+}
+
+define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
+; SM30-LABEL: seq_cst_sys_i32(
+; SM30:       {
+; SM30-NEXT:    .reg .b32 %r<4>;
+; SM30-NEXT:    .reg .b64 %rd<2>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u64 %rd1, [seq_cst_sys_i32_param_0];
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    ld.param.u32 %r1, [seq_cst_sys_i32_param_1];
+; SM30-NEXT:    ld.param.u32 %r2, [seq_cst_sys_i32_param_2];
+; SM30-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM30-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: seq_cst_sys_i32(
+; SM70:       {
+; SM70-NEXT:    .reg .b32 %r<4>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_sys_i32_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u32 %r1, [seq_cst_sys_i32_param_1];
+; SM70-NEXT:    ld.param.u32 %r2, [seq_cst_sys_i32_param_2];
+; SM70-NEXT:    atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    ret;
+; SM90-LABEL: seq_cst_sys_i32(
+; SM90:       {
+; SM90-NEXT:    .reg .b32 %r<4>;
+; SM90-NEXT:    .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_sys_i32_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u32 %r1, [seq_cst_sys_i32_param_1];
+; SM90-NEXT:    ld.param.u32 %r2, [seq_cst_sys_i32_param_2];
+; SM90-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
+  ret i32 %new
+}
+
 ; CHECK-LABEL: relaxed_sys_i64
 define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM30-LABEL: relaxed_sys_i64(
@@ -232,11 +1529,183 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i64_param_0];
 ; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i64_param_1];
 ; SM70-NEXT:    ld.param.u64 %rd3, [relaxed_sys_i64_param_2];
-; SM70-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
+; SM90-LABEL: relaxed_sys_i64(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i64_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i64_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [relaxed_sys_i64_param_2];
+; SM90-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
   %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
   ret i64 %new
 }
+
+define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
+; SM30-LABEL: acquire_sys_i64(
+; SM30:       {
+; SM30-NEXT:    .reg .b64 %rd<5>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u64 %rd1, [acquire_sys_i64_param_0];
+; SM30-NEXT:    ld.param.u64 %rd2, [acquire_sys_i64_param_1];
+; SM30-NEXT:    ld.param.u64 %rd3, [acquire_sys_i64_param_2];
+; SM30-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM30-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: acquire_sys_i64(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acquire_sys_i64_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [acquire_sys_i64_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acquire_sys_i64_param_2];
+; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+; SM90-LABEL: acquire_sys_i64(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acquire_sys_i64_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acquire_sys_i64_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acquire_sys_i64_param_2];
+; SM90-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
+  ret i64 %new
+}
+
+define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
+; SM30-LABEL: acq_rel_sys_i64(
+; SM30:       {
+; SM30-NEXT:    .reg .b64 %rd<5>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u64 %rd1, [acq_rel_sys_i64_param_0];
+; SM30-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i64_param_1];
+; SM30-NEXT:    ld.param.u64 %rd3, [acq_rel_sys_i64_param_2];
+; SM30-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM30-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: acq_rel_sys_i64(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [acq_rel_sys_i64_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i64_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [acq_rel_sys_i64_param_2];
+; SM70-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+; SM90-LABEL: acq_rel_sys_i64(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [acq_rel_sys_i64_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [acq_rel_sys_i64_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [acq_rel_sys_i64_param_2];
+; SM90-NEXT:    atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+  ret i64 %new
+}
+
+define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
+; SM30-LABEL: release_sys_i64(
+; SM30:       {
+; SM30-NEXT:    .reg .b64 %rd<5>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u64 %rd1, [release_sys_i64_param_0];
+; SM30-NEXT:    ld.param.u64 %rd2, [release_sys_i64_param_1];
+; SM30-NEXT:    ld.param.u64 %rd3, [release_sys_i64_param_2];
+; SM30-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM30-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: release_sys_i64(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [release_sys_i64_param_0];
+; SM70-NEXT:    ld.param.u64 %rd2, [release_sys_i64_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [release_sys_i64_param_2];
+; SM70-NEXT:    atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+; SM90-LABEL: release_sys_i64(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [release_sys_i64_param_0];
+; SM90-NEXT:    ld.param.u64 %rd2, [release_sys_i64_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [release_sys_i64_param_2];
+; SM90-NEXT:    atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
+  ret i64 %new
+}
+
+define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
+; SM30-LABEL: seq_cst_sys_i64(
+; SM30:       {
+; SM30-NEXT:    .reg .b64 %rd<5>;
+; SM30-EMPTY:
+; SM30-NEXT:  // %bb.0:
+; SM30-NEXT:    ld.param.u64 %rd1, [seq_cst_sys_i64_param_0];
+; SM30-NEXT:    membar.sys;
+; SM30-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i64_param_1];
+; SM30-NEXT:    ld.param.u64 %rd3, [seq_cst_sys_i64_param_2];
+; SM30-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM30-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM30-NEXT:    ret;
+;
+; SM70-LABEL: seq_cst_sys_i64(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [seq_cst_sys_i64_param_0];
+; SM70-NEXT:    fence.sc.sys;
+; SM70-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i64_param_1];
+; SM70-NEXT:    ld.param.u64 %rd3, [seq_cst_sys_i64_param_2];
+; SM70-NEXT:    atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+; SM90-LABEL: seq_cst_sys_i64(
+; SM90:       {
+; SM90-NEXT:    .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT:  // %bb.0:
+; SM90-NEXT:    ld.param.u64 %rd1, [seq_cst_sys_i64_param_0];
+; SM90-NEXT:    fence.sc.sys;
+; SM90-NEXT:    ld.param.u64 %rd2, [seq_cst_sys_i64_param_1];
+; SM90-NEXT:    ld.param.u64 %rd3, [seq_cst_sys_i64_param_2];
+; SM90-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT:    fence.acquire.sys;
+; SM90-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT:    ret;
+  %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+  ret i64 %new
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py
new file mode 100644
index 000000000000000..bf787406b76c01f
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py
@@ -0,0 +1,46 @@
+# For manual usage, not as a part of lit tests. Used for generating the following tests:
+# cmpxchg-sm60.ll, cmpxchg-sm70.ll, cmpxchg-sm90.ll
+
+from string import Template
+from itertools import product
+
+cmpxchg_func = Template(
+"""define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
+    %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure
+    ret i$size %new
+}
+"""
+)
+
+run_statement = Template(
+    """; RUN: llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | FileCheck %s --check-prefix=SM${sm}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | %ptxas-verify -arch=sm_${sm} %}
+"""
+)
+
+TESTS = [(60, 50), (70, 63), (90, 87)]
+
+LLVM_SCOPES = ["", "block", "cluster", "device"]
+
+SCOPE_LLVM_TO_PTX = {"": "sys", "block": "cta", "cluster": "cluster", "device": "gpu"}
+
+SUCCESS_ORDERINGS = ["monotonic", "acquire", "release", "acq_rel", "seq_cst"]
+
+FAILURE_ORDERINGS = ["monotonic", "acquire", "seq_cst"]
+
+SIZES = [8, 16, 32, 64]
+
+ADDRSPACES = [0, 1, 3]
+
+ADDRSPACE_NUM_TO_ADDRSPACE = {0: "generic", 1: "global", 3: "shared"}
+
+if __name__ == "__main__":
+    for sm, ptx in TESTS:
+        with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp:
+            print(run_statement.substitute(sm=sm, ptx=ptx), file=fp)
+            for size, success, failure, addrspace in product(SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES):
+                if addrspace == 0:
+                    addrspace_cast = ""
+                else:
+                    addrspace_cast = " addrspace({})".format(str(addrspace))
+                print(cmpxchg_func.substitute(success=success, failure=failure, size=size, addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], addrspace_cast=addrspace_cast), file=fp)
diff --git a/llvm/test/CodeGen/NVPTX/lit.local.cfg b/llvm/test/CodeGen/NVPTX/lit.local.cfg
index 54a6c338bdf85af..84cce669ec10b48 100644
--- a/llvm/test/CodeGen/NVPTX/lit.local.cfg
+++ b/llvm/test/CodeGen/NVPTX/lit.local.cfg
@@ -1,4 +1,4 @@
 if not "NVPTX" in config.root.targets:
     config.unsupported = True
 config.suffixes.add(".py")
-config.excludes = ["fence.py"]
+config.excludes = ["fence.py", "cmpxchg.py"]

>From 87d17e5da7c516ae43d9396fabad0c266d3b1903 Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Fri, 7 Feb 2025 00:02:54 +0000
Subject: [PATCH 2/3] clang-format, black

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 19 +++++++++++--------
 llvm/test/CodeGen/NVPTX/cmpxchg.py          | 17 ++++++++++++++---
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index a74eada0afcebab..db5f45fb7d09c85 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5603,9 +5603,9 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
   if (isa<AtomicCmpXchgInst>(Inst)) {
     // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
     if (isReleaseOrStronger(Ord))
-      return Ord == AtomicOrdering::SequentiallyConsistent ?
-             Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) :
-             Builder.CreateFence(AtomicOrdering::Release);
+      return Ord == AtomicOrdering::SequentiallyConsistent
+                 ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
+                 : Builder.CreateFence(AtomicOrdering::Release);
   } else {
     return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
   }
@@ -5617,13 +5617,16 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
                                                     AtomicOrdering Ord) const {
   // Specialize for cmpxchg
   if (isa<AtomicCmpXchgInst>(Inst)) {
-    auto CASWidth= cast<IntegerType>(dyn_cast<AtomicCmpXchgInst>(Inst)->getCompareOperand()->getType())->getBitWidth();
+    auto CASWidth =
+        cast<IntegerType>(
+            dyn_cast<AtomicCmpXchgInst>(Inst)->getCompareOperand()->getType())
+            ->getBitWidth();
     // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
     if (isAcquireOrStronger(Ord))
-      return (Ord == AtomicOrdering::SequentiallyConsistent
-              && CASWidth >= STI.getMinCmpXchgSizeInBits()) ?
-              nullptr :
-              Builder.CreateFence(AtomicOrdering::Acquire);
+      return (Ord == AtomicOrdering::SequentiallyConsistent &&
+              CASWidth >= STI.getMinCmpXchgSizeInBits())
+                 ? nullptr
+                 : Builder.CreateFence(AtomicOrdering::Acquire);
   } else {
     return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
   }
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py
index bf787406b76c01f..ae7450015ecd2eb 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.py
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py
@@ -5,7 +5,7 @@
 from itertools import product
 
 cmpxchg_func = Template(
-"""define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
+    """define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
     %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure
     ret i$size %new
 }
@@ -38,9 +38,20 @@
     for sm, ptx in TESTS:
         with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp:
             print(run_statement.substitute(sm=sm, ptx=ptx), file=fp)
-            for size, success, failure, addrspace in product(SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES):
+            for size, success, failure, addrspace in product(
+                SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES
+            ):
                 if addrspace == 0:
                     addrspace_cast = ""
                 else:
                     addrspace_cast = " addrspace({})".format(str(addrspace))
-                print(cmpxchg_func.substitute(success=success, failure=failure, size=size, addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], addrspace_cast=addrspace_cast), file=fp)
+                print(
+                    cmpxchg_func.substitute(
+                        success=success,
+                        failure=failure,
+                        size=size,
+                        addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
+                        addrspace_cast=addrspace_cast,
+                    ),
+                    file=fp,
+                )

>From e6949e95fca01fce6726778e32aa49b8afcd8629 Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Fri, 7 Feb 2025 01:37:58 +0000
Subject: [PATCH 3/3] fix tests: SM < 70 does not support memory order, emit
 explicit atom.relaxed.cas for monotonic

---
 llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 8 ++++----
 llvm/test/CodeGen/NVPTX/atomics.ll      | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index 67abfe8295a6231..9027bd6a1478053 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -71,7 +71,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71-NEXT:    shl.b32 %r30, %r29, %r2;
 ; CHECKPTX71-NEXT:    and.b32 %r31, %r54, %r3;
 ; CHECKPTX71-NEXT:    or.b32 %r32, %r31, %r30;
-; CHECKPTX71-NEXT:    atom.cas.b32 %r6, [%r1], %r54, %r32;
+; CHECKPTX71-NEXT:    atom.relaxed.cas.b32 %r6, [%r1], %r54, %r32;
 ; CHECKPTX71-NEXT:    setp.ne.s32 %p1, %r6, %r54;
 ; CHECKPTX71-NEXT:    mov.u32 %r54, %r6;
 ; CHECKPTX71-NEXT:    @%p1 bra $L__BB0_1;
@@ -87,7 +87,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71-NEXT:    shl.b32 %r35, %r34, %r2;
 ; CHECKPTX71-NEXT:    and.b32 %r36, %r55, %r3;
 ; CHECKPTX71-NEXT:    or.b32 %r37, %r36, %r35;
-; CHECKPTX71-NEXT:    atom.cas.b32 %r9, [%r1], %r55, %r37;
+; CHECKPTX71-NEXT:    atom.relaxed.cas.b32 %r9, [%r1], %r55, %r37;
 ; CHECKPTX71-NEXT:    setp.ne.s32 %p2, %r9, %r55;
 ; CHECKPTX71-NEXT:    mov.u32 %r55, %r9;
 ; CHECKPTX71-NEXT:    @%p2 bra $L__BB0_3;
@@ -109,7 +109,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71-NEXT:    shl.b32 %r43, %r42, %r11;
 ; CHECKPTX71-NEXT:    and.b32 %r44, %r56, %r12;
 ; CHECKPTX71-NEXT:    or.b32 %r45, %r44, %r43;
-; CHECKPTX71-NEXT:    atom.global.cas.b32 %r15, [%r10], %r56, %r45;
+; CHECKPTX71-NEXT:    atom.relaxed.global.cas.b32 %r15, [%r10], %r56, %r45;
 ; CHECKPTX71-NEXT:    setp.ne.s32 %p3, %r15, %r56;
 ; CHECKPTX71-NEXT:    mov.u32 %r56, %r15;
 ; CHECKPTX71-NEXT:    @%p3 bra $L__BB0_5;
@@ -131,7 +131,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71-NEXT:    shl.b32 %r51, %r50, %r17;
 ; CHECKPTX71-NEXT:    and.b32 %r52, %r57, %r18;
 ; CHECKPTX71-NEXT:    or.b32 %r53, %r52, %r51;
-; CHECKPTX71-NEXT:    atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
+; CHECKPTX71-NEXT:    atom.relaxed.shared.cas.b32 %r21, [%r16], %r57, %r53;
 ; CHECKPTX71-NEXT:    setp.ne.s32 %p4, %r21, %r57;
 ; CHECKPTX71-NEXT:    mov.u32 %r57, %r21;
 ; CHECKPTX71-NEXT:    @%p4 bra $L__BB0_7;
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index ac2bd8828b20579..fccc3a8844ffc4f 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -177,7 +177,7 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
 
 ; CHECK-LABEL: atomicrmw_add_f16_generic
 define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
-; CHECK: atom.acquire.cas
+; CHECK: atom.cas
   %ret = atomicrmw fadd ptr %addr, half %val seq_cst
   ret half %ret
 }
@@ -198,14 +198,14 @@ define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
 
 ; CHECK-LABEL: atomic_cmpxchg_i32
 define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) {
-; CHECK: atom.acquire.cas.b32
+; CHECK: atom.cas.b32
   %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
   ret i32 %new
 }
 
 ; CHECK-LABEL: atomic_cmpxchg_i64
 define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) {
-; CHECK: atom.acquire.cas.b64
+; CHECK: atom.cas.b64
   %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
   ret i64 %new
 }



More information about the llvm-commits mailing list